164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// Copyright (C) 2016 and later: Unicode, Inc. and others.
264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html
3fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/*
4fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius*******************************************************************************
5fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* Copyright (C) 2013-2014, International Business Machines
6fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* Corporation and others.  All Rights Reserved.
7fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius*******************************************************************************
8fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* collationbuilder.cpp
9fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius*
10fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* (replaced the former ucol_bld.cpp)
11fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius*
12fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* created on: 2013may06
13fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* created by: Markus W. Scherer
14fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius*/
15fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
16fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER
17fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include <stdio.h>
18fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif
19fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
20fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/utypes.h"
21fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
22fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#if !UCONFIG_NO_COLLATION
23fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
24fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/caniter.h"
25fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/normalizer2.h"
26fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/tblcoll.h"
27fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/parseerr.h"
28fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/uchar.h"
29fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/ucol.h"
30fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/unistr.h"
31fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/usetiter.h"
32fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/utf16.h"
33fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/uversion.h"
34fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "cmemory.h"
35fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collation.h"
36fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationbuilder.h"
37fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationdata.h"
38fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationdatabuilder.h"
39fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationfastlatin.h"
40fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationroot.h"
41fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationrootelements.h"
42fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationruleparser.h"
43fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationsettings.h"
44fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationtailoring.h"
45fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collationweights.h"
46fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "normalizer2impl.h"
47fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "uassert.h"
48fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "ucol_imp.h"
49fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "utf16collationiterator.h"
50fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
51fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_BEGIN
52fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
53fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusnamespace {
54fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
55fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass BundleImporter : public CollationRuleParser::Importer {
56fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic:
57f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    BundleImporter() {}
58fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    virtual ~BundleImporter();
59f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    virtual void getRules(
60fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            const char *localeID, const char *collationType,
61f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius            UnicodeString &rules,
62fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            const char *&errorReason, UErrorCode &errorCode);
63fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius};
64fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
65f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusBundleImporter::~BundleImporter() {}
66fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
67f9878a236aa0d9662d8e40cafdaf2e04cd615835ccorneliusvoid
68fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusBundleImporter::getRules(
69fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        const char *localeID, const char *collationType,
70f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        UnicodeString &rules,
71fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        const char *& /*errorReason*/, UErrorCode &errorCode) {
72f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    CollationLoader::loadRules(localeID, collationType, rules, errorCode);
73fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
74fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
75fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}  // namespace
76fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
77fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// RuleBasedCollator implementation ---------------------------------------- ***
78fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
79fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// These methods are here, rather than in rulebasedcollator.cpp,
80fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// for modularization:
81fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// Most code using Collator does not need to build a Collator from rules.
82fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// By moving these constructors and helper methods to a separate file,
83fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// most code will not have a static dependency on the builder code.
84fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
85fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator()
86fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        : data(NULL),
87fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          settings(NULL),
88fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          tailoring(NULL),
89f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius          cacheEntry(NULL),
90fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          validLocale(""),
91fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          explicitlySetAttributes(0),
92fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          actualLocaleIsSameAsValid(FALSE) {
93fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
94fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
95fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode)
96fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        : data(NULL),
97fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          settings(NULL),
98fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          tailoring(NULL),
99f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius          cacheEntry(NULL),
100fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          validLocale(""),
101fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          explicitlySetAttributes(0),
102fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          actualLocaleIsSameAsValid(FALSE) {
103fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, NULL, NULL, errorCode);
104fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
105fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
106fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength,
107fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                     UErrorCode &errorCode)
108fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        : data(NULL),
109fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          settings(NULL),
110fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          tailoring(NULL),
111f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius          cacheEntry(NULL),
112fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          validLocale(""),
113fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          explicitlySetAttributes(0),
114fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          actualLocaleIsSameAsValid(FALSE) {
115fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    internalBuildTailoring(rules, strength, UCOL_DEFAULT, NULL, NULL, errorCode);
116fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
117fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
118fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
119fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                     UColAttributeValue decompositionMode,
120fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                     UErrorCode &errorCode)
121fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        : data(NULL),
122fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          settings(NULL),
123fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          tailoring(NULL),
124f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius          cacheEntry(NULL),
125fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          validLocale(""),
126fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          explicitlySetAttributes(0),
127fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          actualLocaleIsSameAsValid(FALSE) {
128fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, NULL, NULL, errorCode);
129fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
130fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
131fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
132fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                     ECollationStrength strength,
133fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                     UColAttributeValue decompositionMode,
134fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                     UErrorCode &errorCode)
135fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        : data(NULL),
136fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          settings(NULL),
137fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          tailoring(NULL),
138f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius          cacheEntry(NULL),
139fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          validLocale(""),
140fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          explicitlySetAttributes(0),
141fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          actualLocaleIsSameAsValid(FALSE) {
142fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    internalBuildTailoring(rules, strength, decompositionMode, NULL, NULL, errorCode);
143fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
144fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
145fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::RuleBasedCollator(const UnicodeString &rules,
146fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                     UParseError &parseError, UnicodeString &reason,
147fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                     UErrorCode &errorCode)
148fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        : data(NULL),
149fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          settings(NULL),
150fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          tailoring(NULL),
151f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius          cacheEntry(NULL),
152fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          validLocale(""),
153fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          explicitlySetAttributes(0),
154fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          actualLocaleIsSameAsValid(FALSE) {
155fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode);
156fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
157fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
158fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid
159fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusRuleBasedCollator::internalBuildTailoring(const UnicodeString &rules,
160fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                          int32_t strength,
161fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                          UColAttributeValue decompositionMode,
162fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                          UParseError *outParseError, UnicodeString *outReason,
163fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                          UErrorCode &errorCode) {
164fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    const CollationTailoring *base = CollationRoot::getRoot(errorCode);
165fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return; }
166fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(outReason != NULL) { outReason->remove(); }
167fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    CollationBuilder builder(base, errorCode);
168fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UVersionInfo noVersion = { 0, 0, 0, 0 };
169fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    BundleImporter importer;
170fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion,
171fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                                             &importer,
172fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                                             outParseError, errorCode));
173fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) {
174fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        const char *reason = builder.getErrorReason();
175fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(reason != NULL && outReason != NULL) {
176fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            *outReason = UnicodeString(reason, -1, US_INV);
177fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
178fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return;
179fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
180fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    t->actualLocale.setToBogus();
181f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    adoptTailoring(t.orphan(), errorCode);
182dbc22bd174be483711cea006f3189d8289835830ccornelius    // Set attributes after building the collator,
183dbc22bd174be483711cea006f3189d8289835830ccornelius    // to keep the default settings consistent with the rule string.
184dbc22bd174be483711cea006f3189d8289835830ccornelius    if(strength != UCOL_DEFAULT) {
185dbc22bd174be483711cea006f3189d8289835830ccornelius        setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode);
186dbc22bd174be483711cea006f3189d8289835830ccornelius    }
187dbc22bd174be483711cea006f3189d8289835830ccornelius    if(decompositionMode != UCOL_DEFAULT) {
188dbc22bd174be483711cea006f3189d8289835830ccornelius        setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode);
189dbc22bd174be483711cea006f3189d8289835830ccornelius    }
190fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
191fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
192fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// CollationBuilder implementation ----------------------------------------- ***
193fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1941b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// Some compilers don't care if constants are defined in the .cpp file.
1951b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert// MS Visual C++ does not like it, but gcc requires it. clang does not care.
1961b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#ifndef _MSC_VER
1971b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubertconst int32_t CollationBuilder::HAS_BEFORE2;
1981b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubertconst int32_t CollationBuilder::HAS_BEFORE3;
1991b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#endif
2001b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert
201fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode)
202fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        : nfd(*Normalizer2::getNFDInstance(errorCode)),
203fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          fcd(*Normalizer2Factory::getFCDInstance(errorCode)),
204fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)),
205fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          base(b),
206fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          baseData(b->data),
207fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          rootElements(b->data->rootElements, b->data->rootElementsLength),
208fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          variableTop(0),
209fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE),
210fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          errorReason(NULL),
211fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          cesLength(0),
212fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius          rootPrimaryIndexes(errorCode), nodes(errorCode) {
213fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    nfcImpl.ensureCanonIterData(errorCode);
214fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) {
215fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        errorReason = "CollationBuilder fields initialization failed";
216fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return;
217fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
218fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(dataBuilder == NULL) {
219fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        errorCode = U_MEMORY_ALLOCATION_ERROR;
220fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return;
221fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
222fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    dataBuilder->initForTailoring(baseData, errorCode);
223fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) {
224fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        errorReason = "CollationBuilder initialization failed";
225fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
226fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
227fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
228fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::~CollationBuilder() {
229fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    delete dataBuilder;
230fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
231fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
232fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationTailoring *
233fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::parseAndBuild(const UnicodeString &ruleString,
234fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                const UVersionInfo rulesVersion,
235fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                CollationRuleParser::Importer *importer,
236fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                UParseError *outParseError,
237fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                UErrorCode &errorCode) {
238fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return NULL; }
239fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(baseData->rootElements == NULL) {
240fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        errorCode = U_MISSING_RESOURCE_ERROR;
241fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        errorReason = "missing root elements data, tailoring not supported";
242fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return NULL;
243fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
244fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings));
245fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(tailoring.isNull() || tailoring->isBogus()) {
246fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        errorCode = U_MEMORY_ALLOCATION_ERROR;
247fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return NULL;
248fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
249fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    CollationRuleParser parser(baseData, errorCode);
250fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return NULL; }
251fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Note: This always bases &[last variable] and &[first regular]
252fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // on the root collator's maxVariable/variableTop.
253fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // If we wanted this to change after [maxVariable x], then we would keep
254fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // the tailoring.settings pointer here and read its variableTop when we need it.
255fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // See http://unicode.org/cldr/trac/ticket/6070
256fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    variableTop = base->settings->variableTop;
257fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    parser.setSink(this);
258fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    parser.setImporter(importer);
259dbc22bd174be483711cea006f3189d8289835830ccornelius    CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings);
260dbc22bd174be483711cea006f3189d8289835830ccornelius    parser.parse(ruleString, ownedSettings, outParseError, errorCode);
261fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    errorReason = parser.getErrorReason();
262fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return NULL; }
263fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(dataBuilder->hasMappings()) {
264fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        makeTailoredCEs(errorCode);
265fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        closeOverComposites(errorCode);
266fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        finalizeCEs(errorCode);
267fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Copy all of ASCII, and Latin-1 letters, into each tailoring.
268fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        optimizeSet.add(0, 0x7f);
269fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        optimizeSet.add(0xc0, 0xff);
270fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Hangul is decomposed on the fly during collation,
271fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // and the tailoring data is always built with HANGUL_TAG specials.
272fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
273fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        dataBuilder->optimize(optimizeSet, errorCode);
274fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        tailoring->ensureOwnedData(errorCode);
275fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(U_FAILURE(errorCode)) { return NULL; }
276fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(fastLatinEnabled) { dataBuilder->enableFastLatin(); }
277fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        dataBuilder->build(*tailoring->ownedData, errorCode);
278fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        tailoring->builder = dataBuilder;
279fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        dataBuilder = NULL;
280fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    } else {
281fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        tailoring->data = baseData;
282fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
283fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return NULL; }
284dbc22bd174be483711cea006f3189d8289835830ccornelius    ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
285dbc22bd174be483711cea006f3189d8289835830ccornelius        tailoring->data, ownedSettings,
286f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
287fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    tailoring->rules = ruleString;
288fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    tailoring->rules.getTerminatedBuffer();  // ensure NUL-termination
289fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    tailoring->setVersion(base->version, rulesVersion);
290fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return tailoring.orphan();
291fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
292fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
293fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid
294fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addReset(int32_t strength, const UnicodeString &str,
295fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                           const char *&parserErrorReason, UErrorCode &errorCode) {
296fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return; }
297fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(!str.isEmpty());
298fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(str.charAt(0) == CollationRuleParser::POS_LEAD) {
299fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode);
300fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        cesLength = 1;
301fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(U_FAILURE(errorCode)) { return; }
302fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0);
303fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    } else {
304fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // normal reset to a character or string
305fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        UnicodeString nfdString = nfd.normalize(str, errorCode);
306fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(U_FAILURE(errorCode)) {
307fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason = "normalizing the reset position";
308fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
309fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
310fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        cesLength = dataBuilder->getCEs(nfdString, ces, 0);
311fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
312fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
313fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason = "reset position maps to too many collation elements (more than 31)";
314fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
315fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
316fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
317fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(strength == UCOL_IDENTICAL) { return; }  // simple reset-at-position
318fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
319fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // &[before strength]position
320fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY);
321fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
322fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return; }
323fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
324fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int64_t node = nodes.elementAti(index);
3251b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    // If the index is for a "weaker" node,
326fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // then skip backwards over this and further "weaker" nodes.
327fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    while(strengthFromNode(node) > strength) {
328fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        index = previousIndexFromNode(node);
329fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        node = nodes.elementAti(index);
330fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
331fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
332fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Find or insert a node whose index we will put into a temporary CE.
333fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(strengthFromNode(node) == strength && isTailoredNode(node)) {
334fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Reset to just before this same-strength tailored node.
335fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        index = previousIndexFromNode(node);
336fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    } else if(strength == UCOL_PRIMARY) {
337fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // root primary node (has no previous index)
338fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        uint32_t p = weight32FromNode(node);
339fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(p == 0) {
340fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            errorCode = U_UNSUPPORTED_ERROR;
341fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason = "reset primary-before ignorable not possible";
342fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
343fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
344fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(p <= rootElements.getFirstPrimary()) {
345fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // There is no primary gap between ignorables and the space-first-primary.
346fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            errorCode = U_UNSUPPORTED_ERROR;
347fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason = "reset primary-before first non-ignorable not supported";
348fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
349fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
350fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(p == Collation::FIRST_TRAILING_PRIMARY) {
351fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // We do not support tailoring to an unassigned-implicit CE.
352fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            errorCode = U_UNSUPPORTED_ERROR;
353fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason = "reset primary-before [first trailing] not supported";
354fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
355fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
356fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p));
357fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        index = findOrInsertNodeForPrimary(p, errorCode);
358fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Go to the last node in this list:
359fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Tailor after the last node between adjacent root nodes.
360fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        for(;;) {
361fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            node = nodes.elementAti(index);
362fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            int32_t nextIndex = nextIndexFromNode(node);
363fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(nextIndex == 0) { break; }
364fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            index = nextIndex;
365fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
366fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    } else {
367fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // &[before 2] or &[before 3]
368fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        index = findCommonNode(index, UCOL_SECONDARY);
369fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(strength >= UCOL_TERTIARY) {
370fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            index = findCommonNode(index, UCOL_TERTIARY);
371fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
3721b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        // findCommonNode() stayed on the stronger node or moved to
3731b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        // an explicit common-weight node of the reset-before strength.
374fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        node = nodes.elementAti(index);
375fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(strengthFromNode(node) == strength) {
376fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // Found a same-strength node with an explicit weight.
377fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            uint32_t weight16 = weight16FromNode(node);
378fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(weight16 == 0) {
379fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                errorCode = U_UNSUPPORTED_ERROR;
380fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                if(strength == UCOL_SECONDARY) {
381fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    parserErrorReason = "reset secondary-before secondary ignorable not possible";
382fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                } else {
383fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    parserErrorReason = "reset tertiary-before completely ignorable not possible";
384fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                }
385fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                return;
386fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
3871b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            U_ASSERT(weight16 > Collation::BEFORE_WEIGHT16);
3881b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            // Reset to just before this node.
3891b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            // Insert the preceding same-level explicit weight if it is not there already.
3901b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            // Which explicit weight immediately precedes this one?
3911b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            weight16 = getWeight16Before(index, node, strength);
3921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            // Does this preceding weight have a node?
3931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            uint32_t previousWeight16;
394fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            int32_t previousIndex = previousIndexFromNode(node);
3951b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            for(int32_t i = previousIndex;; i = previousIndexFromNode(node)) {
3961b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                node = nodes.elementAti(i);
3971b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                int32_t previousStrength = strengthFromNode(node);
3981b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                if(previousStrength < strength) {
3991b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                    U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16 || i == previousIndex);
4001b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                    // Either the reset element has an above-common weight and
4011b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                    // the parent node provides the implied common weight,
4021b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                    // or the reset element has a weight<=common in the node
4031b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                    // right after the parent, and we need to insert the preceding weight.
4041b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                    previousWeight16 = Collation::COMMON_WEIGHT16;
4051b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                    break;
4061b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                } else if(previousStrength == strength && !isTailoredNode(node)) {
4071b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                    previousWeight16 = weight16FromNode(node);
4081b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                    break;
4091b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                }
4101b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                // Skip weaker nodes and same-level tailored nodes.
4111b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            }
4121b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            if(previousWeight16 == weight16) {
4131b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                // The preceding weight has a node,
4141b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                // maybe with following weaker or tailored nodes.
4151b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                // Reset to the last of them.
416fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                index = previousIndex;
417fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            } else {
4181b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                // Insert a node with the preceding weight, reset to that.
4191b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
4201b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                index = insertNodeBetween(previousIndex, index, node, errorCode);
421fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
422fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        } else {
423fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // Found a stronger node with implied strength-common weight.
4241b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            uint32_t weight16 = getWeight16Before(index, node, strength);
4251b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            index = findOrInsertWeakNode(index, weight16, strength, errorCode);
426fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
427fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Strength of the temporary CE = strength of its reset position.
428fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Code above raises an error if the before-strength is stronger.
429fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        strength = ceStrength(ces[cesLength - 1]);
430fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
431fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) {
432fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        parserErrorReason = "inserting reset position for &[before n]";
433fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return;
434fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
435fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
436fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
437fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
4381b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubertuint32_t
4391b7d32f919554dda9c193b32188251337bc756f1Fredrik RoubertCollationBuilder::getWeight16Before(int32_t index, int64_t node, int32_t level) {
4401b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    U_ASSERT(strengthFromNode(node) < level || !isTailoredNode(node));
4411b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    // Collect the root CE weights if this node is for a root CE.
4421b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    // If it is not, then return the low non-primary boundary for a tailored CE.
4431b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    uint32_t t;
4441b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    if(strengthFromNode(node) == UCOL_TERTIARY) {
4451b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        t = weight16FromNode(node);
4461b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    } else {
4471b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        t = Collation::COMMON_WEIGHT16;  // Stronger node with implied common weight.
4481b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    }
4491b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    while(strengthFromNode(node) > UCOL_SECONDARY) {
4501b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        index = previousIndexFromNode(node);
4511b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        node = nodes.elementAti(index);
4521b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    }
4531b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    if(isTailoredNode(node)) {
4541b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        return Collation::BEFORE_WEIGHT16;
4551b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    }
4561b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    uint32_t s;
4571b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    if(strengthFromNode(node) == UCOL_SECONDARY) {
4581b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        s = weight16FromNode(node);
4591b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    } else {
4601b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        s = Collation::COMMON_WEIGHT16;  // Stronger node with implied common weight.
4611b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    }
4621b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    while(strengthFromNode(node) > UCOL_PRIMARY) {
4631b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        index = previousIndexFromNode(node);
4641b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        node = nodes.elementAti(index);
4651b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    }
4661b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    if(isTailoredNode(node)) {
4671b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        return Collation::BEFORE_WEIGHT16;
4681b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    }
4691b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    // [p, s, t] is a root CE. Return the preceding weight for the requested level.
4701b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    uint32_t p = weight32FromNode(node);
4711b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    uint32_t weight16;
4721b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    if(level == UCOL_SECONDARY) {
4731b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        weight16 = rootElements.getSecondaryBefore(p, s);
4741b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    } else {
4751b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        weight16 = rootElements.getTertiaryBefore(p, s, t);
4761b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0);
4771b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    }
4781b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    return weight16;
4791b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert}
4801b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert
481fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint64_t
482fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::getSpecialResetPosition(const UnicodeString &str,
483fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                          const char *&parserErrorReason, UErrorCode &errorCode) {
484fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(str.length() == 2);
485fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int64_t ce;
486fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t strength = UCOL_PRIMARY;
487fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UBool isBoundary = FALSE;
488fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE;
489fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING);
490fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    switch(pos) {
491fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    case CollationRuleParser::FIRST_TERTIARY_IGNORABLE:
492fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Quaternary CEs are not supported.
493fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Non-zero quaternary weights are possible only on tertiary or stronger CEs.
494fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return 0;
495fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    case CollationRuleParser::LAST_TERTIARY_IGNORABLE:
496fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return 0;
497fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: {
498fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Look for a tailored tertiary node after [0, 0, 0].
499fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode);
500fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(U_FAILURE(errorCode)) { return 0; }
501fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int64_t node = nodes.elementAti(index);
502fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if((index = nextIndexFromNode(node)) != 0) {
503fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            node = nodes.elementAti(index);
504fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY);
505fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) {
506fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                return tempCEFromIndexAndStrength(index, UCOL_TERTIARY);
507fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
508fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
509fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return rootElements.getFirstTertiaryCE();
510fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // No need to look for nodeHasAnyBefore() on a tertiary node.
511fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
512fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    case CollationRuleParser::LAST_SECONDARY_IGNORABLE:
513fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ce = rootElements.getLastTertiaryCE();
514fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        strength = UCOL_TERTIARY;
515fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        break;
516fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: {
517fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Look for a tailored secondary node after [0, 0, *].
518fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode);
519fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(U_FAILURE(errorCode)) { return 0; }
520fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int64_t node = nodes.elementAti(index);
521fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        while((index = nextIndexFromNode(node)) != 0) {
522fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            node = nodes.elementAti(index);
523fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            strength = strengthFromNode(node);
524fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(strength < UCOL_SECONDARY) { break; }
525fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(strength == UCOL_SECONDARY) {
526fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                if(isTailoredNode(node)) {
527fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    if(nodeHasBefore3(node)) {
528fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
529fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        U_ASSERT(isTailoredNode(nodes.elementAti(index)));
530fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    }
531fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    return tempCEFromIndexAndStrength(index, UCOL_SECONDARY);
532fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                } else {
533fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    break;
534fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                }
535fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
536fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
537fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ce = rootElements.getFirstSecondaryCE();
538fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        strength = UCOL_SECONDARY;
539fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        break;
540fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
541fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    case CollationRuleParser::LAST_PRIMARY_IGNORABLE:
542fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ce = rootElements.getLastSecondaryCE();
543fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        strength = UCOL_SECONDARY;
544fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        break;
545fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    case CollationRuleParser::FIRST_VARIABLE:
546fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ce = rootElements.getFirstPrimaryCE();
547fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        isBoundary = TRUE;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary
548fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        break;
549fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    case CollationRuleParser::LAST_VARIABLE:
550fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);
551fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        break;
552fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    case CollationRuleParser::FIRST_REGULAR:
553fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);
554fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        isBoundary = TRUE;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary
555fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        break;
556fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    case CollationRuleParser::LAST_REGULAR:
557fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Use the Hani-first-primary rather than the actual last "regular" CE before it,
558fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // for backward compatibility with behavior before the introduction of
559fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // script-first-primary CEs in the root collator.
560fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ce = rootElements.firstCEWithPrimaryAtLeast(
561fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            baseData->getFirstPrimaryForGroup(USCRIPT_HAN));
562fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        break;
563f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius    case CollationRuleParser::FIRST_IMPLICIT:
564f9878a236aa0d9662d8e40cafdaf2e04cd615835ccornelius        ce = baseData->getSingleCE(0x4e00, errorCode);
565fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        break;
566fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    case CollationRuleParser::LAST_IMPLICIT:
567fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // We do not support tailoring to an unassigned-implicit CE.
568fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        errorCode = U_UNSUPPORTED_ERROR;
569fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        parserErrorReason = "reset to [last implicit] not supported";
570fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return 0;
571fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    case CollationRuleParser::FIRST_TRAILING:
572fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY);
573fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        isBoundary = TRUE;  // trailing first primary (there is no mapping for it)
574fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        break;
575fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    case CollationRuleParser::LAST_TRAILING:
576fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
577fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        parserErrorReason = "LDML forbids tailoring to U+FFFF";
578fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return 0;
579fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    default:
580fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        U_ASSERT(FALSE);
581fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return 0;
582fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
583fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
584fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode);
585fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return 0; }
586fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int64_t node = nodes.elementAti(index);
587fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if((pos & 1) == 0) {
588fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // even pos = [first xyz]
589fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(!nodeHasAnyBefore(node) && isBoundary) {
590fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // A <group> first primary boundary is artificially added to FractionalUCA.txt.
591fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // It is reachable via its special contraction, but is not normally used.
592fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // Find the first character tailored after the boundary CE,
593fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // or the first real root CE after it.
594fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if((index = nextIndexFromNode(node)) != 0) {
595fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                // If there is a following node, then it must be tailored
596fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                // because there are no root CEs with a boundary primary
597fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                // and non-common secondary/tertiary weights.
598fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                node = nodes.elementAti(index);
599fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                U_ASSERT(isTailoredNode(node));
600fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                ce = tempCEFromIndexAndStrength(index, strength);
601fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            } else {
602fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                U_ASSERT(strength == UCOL_PRIMARY);
603fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                uint32_t p = (uint32_t)(ce >> 32);
604fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                int32_t pIndex = rootElements.findPrimary(p);
605fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                UBool isCompressible = baseData->isCompressiblePrimary(p);
606fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);
607fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                ce = Collation::makeCE(p);
608fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode);
609fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                if(U_FAILURE(errorCode)) { return 0; }
610fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                node = nodes.elementAti(index);
611fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
612fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
613fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(nodeHasAnyBefore(node)) {
614fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // Get the first node that was tailored before this one at a weaker strength.
615fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(nodeHasBefore2(node)) {
616fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
617fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                node = nodes.elementAti(index);
618fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
619fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(nodeHasBefore3(node)) {
620fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
621fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
622fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            U_ASSERT(isTailoredNode(nodes.elementAti(index)));
623fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            ce = tempCEFromIndexAndStrength(index, strength);
624fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
625fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    } else {
626fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // odd pos = [last xyz]
627fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Find the last node that was tailored after the [last xyz]
628fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // at a strength no greater than the position's strength.
629fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        for(;;) {
630fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            int32_t nextIndex = nextIndexFromNode(node);
631fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(nextIndex == 0) { break; }
632fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            int64_t nextNode = nodes.elementAti(nextIndex);
633fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(strengthFromNode(nextNode) < strength) { break; }
634fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            index = nextIndex;
635fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            node = nextNode;
636fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
637fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Do not make a temporary CE for a root node.
638fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // This last node might be the node for the root CE itself,
639fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // or a node with a common secondary or tertiary weight.
640fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(isTailoredNode(node)) {
641fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            ce = tempCEFromIndexAndStrength(index, strength);
642fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
643fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
644fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return ce;
645fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
646fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
647fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid
648fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix,
649fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                              const UnicodeString &str, const UnicodeString &extension,
650fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                              const char *&parserErrorReason, UErrorCode &errorCode) {
651fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return; }
652fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeString nfdPrefix;
653fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(!prefix.isEmpty()) {
654fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        nfd.normalize(prefix, nfdPrefix, errorCode);
655fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(U_FAILURE(errorCode)) {
656fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason = "normalizing the relation prefix";
657fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
658fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
659fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
660fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeString nfdString = nfd.normalize(str, errorCode);
661fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) {
662fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        parserErrorReason = "normalizing the relation string";
663fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return;
664fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
665fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
666fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // The runtime code decomposes Hangul syllables on the fly,
667fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // with recursive processing but without making the Jamo pieces visible for matching.
668fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // It does not work with certain types of contextual mappings.
669fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t nfdLength = nfdString.length();
670fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(nfdLength >= 2) {
671fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        UChar c = nfdString.charAt(0);
672fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) {
673fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // While handling a Hangul syllable, contractions starting with Jamo L or V
674fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // would not see the following Jamo of that syllable.
675fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            errorCode = U_UNSUPPORTED_ERROR;
676fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason = "contractions starting with conjoining Jamo L or V not supported";
677fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
678fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
679fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        c = nfdString.charAt(nfdLength - 1);
680fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(Hangul::isJamoL(c) ||
681fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) {
682fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // A contraction ending with Jamo L or L+V would require
683fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // generating Hangul syllables in addTailComposites() (588 for a Jamo L),
684fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // or decomposing a following Hangul syllable on the fly, during contraction matching.
685fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            errorCode = U_UNSUPPORTED_ERROR;
686fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported";
687fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
688fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
689fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // A Hangul syllable completely inside a contraction is ok.
690fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
691fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Note: If there is a prefix, then the parser checked that
692fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // both the prefix and the string beging with NFC boundaries (not Jamo V or T).
693fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))
694fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // (While handling a Hangul syllable, prefixes on Jamo V or T
695fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // would not see the previous Jamo of that syllable.)
696fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
697fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(strength != UCOL_IDENTICAL) {
698fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Find the node index after which we insert the new tailored node.
699fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode);
700fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        U_ASSERT(cesLength > 0);
701fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int64_t ce = ces[cesLength - 1];
702fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) == 0) {
703fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // There is no primary gap between ignorables and the space-first-primary.
704fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            errorCode = U_UNSUPPORTED_ERROR;
705fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason = "tailoring primary after ignorables not supported";
706fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
707fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
708fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(strength == UCOL_QUATERNARY && ce == 0) {
709fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // The CE data structure does not support non-zero quaternary weights
710fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // on tertiary ignorables.
711fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            errorCode = U_UNSUPPORTED_ERROR;
712fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason = "tailoring quaternary after tertiary ignorables not supported";
713fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
714fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
715fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Insert the new tailored node.
716fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        index = insertTailoredNodeAfter(index, strength, errorCode);
717fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(U_FAILURE(errorCode)) {
718fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason = "modifying collation elements";
719fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
720fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
721fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Strength of the temporary CE:
722fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // The new relation may yield a stronger CE but not a weaker one.
723fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t tempStrength = ceStrength(ce);
724fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(strength < tempStrength) { tempStrength = strength; }
725fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);
726fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
727fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
728fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    setCaseBits(nfdString, parserErrorReason, errorCode);
729fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return; }
730fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
731fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t cesLengthBeforeExtension = cesLength;
732fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(!extension.isEmpty()) {
733fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        UnicodeString nfdExtension = nfd.normalize(extension, errorCode);
734fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(U_FAILURE(errorCode)) {
735fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason = "normalizing the relation extension";
736fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
737fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
738fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength);
739fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
740fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
741fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason =
742fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                "extension string adds too many collation elements (more than 31 total)";
743fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
744fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
745fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
746fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    uint32_t ce32 = Collation::UNASSIGNED_CE32;
747fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if((prefix != nfdPrefix || str != nfdString) &&
748fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) {
749fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Map from the original input to the CEs.
750fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // We do this in case the canonical closure is incomplete,
751fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // so that it is possible to explicitly provide the missing mappings.
752fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode);
753fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
754fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode);
755fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) {
756fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        parserErrorReason = "writing collation elements";
757fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return;
758fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
759fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    cesLength = cesLengthBeforeExtension;
760fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
761fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
762fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t
763fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason,
764fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                         UErrorCode &errorCode) {
765fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return 0; }
766fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY);
767fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
768fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Find the last CE that is at least as "strong" as the requested difference.
769fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Note: Stronger is smaller (UCOL_PRIMARY=0).
770fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int64_t ce;
771fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    for(;; --cesLength) {
772fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(cesLength == 0) {
773fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            ce = ces[0] = 0;
774fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            cesLength = 1;
775fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            break;
776fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        } else {
777fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            ce = ces[cesLength - 1];
778fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
779fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(ceStrength(ce) <= strength) { break; }
780fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
781fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
782fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(isTempCE(ce)) {
783fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // No need to findCommonNode() here for lower levels
784fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // because insertTailoredNodeAfter() will do that anyway.
785fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return indexFromTempCE(ce);
786fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
787fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
788fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // root CE
789fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) {
790fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        errorCode = U_UNSUPPORTED_ERROR;
791fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        parserErrorReason = "tailoring relative to an unassigned code point not supported";
792fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return 0;
793fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
794fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return findOrInsertNodeForRootCE(ce, strength, errorCode);
795fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
796fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
797fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t
798fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) {
799fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return 0; }
800fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE);
801fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
802fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Find or insert the node for each of the root CE's weights,
803fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // down to the requested level/strength.
804fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
805fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT((ce & 0xc0) == 0);
8061b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32), errorCode);
807fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(strength >= UCOL_SECONDARY) {
808fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        uint32_t lower32 = (uint32_t)ce;
809fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode);
810fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(strength >= UCOL_TERTIARY) {
811fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK,
812fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                         UCOL_TERTIARY, errorCode);
813fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
814fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
815fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return index;
816fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
817fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
818fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusnamespace {
819fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
820fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/**
821fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Like Java Collections.binarySearch(List, key, Comparator).
822fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *
823fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * @return the index>=0 where the item was found,
824fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *         or the index<0 for inserting the string at ~index in sorted order
825fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *         (index into rootPrimaryIndexes)
826fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */
827fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t
828fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusbinarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length,
829fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                               const int64_t *nodes, uint32_t p) {
830fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(length == 0) { return ~0; }
831fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t start = 0;
832fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t limit = length;
833fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    for (;;) {
834fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t i = (start + limit) / 2;
835fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int64_t node = nodes[rootPrimaryIndexes[i]];
836fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        uint32_t nodePrimary = (uint32_t)(node >> 32);  // weight32FromNode(node)
837fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if (p == nodePrimary) {
838fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return i;
839fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        } else if (p < nodePrimary) {
840fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if (i == start) {
841fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                return ~start;  // insert s before i
842fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
843fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            limit = i;
844fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        } else {
845fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if (i == start) {
846fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                return ~(start + 1);  // insert s after i
847fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
848fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            start = i;
849fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
850fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
851fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
852fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
853fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}  // namespace
854fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
855fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t
856fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) {
857fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return 0; }
858fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
859fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t rootIndex = binarySearchForRootPrimaryNode(
860fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);
861fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(rootIndex >= 0) {
862fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return rootPrimaryIndexes.elementAti(rootIndex);
863fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    } else {
864fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Start a new list of nodes with this primary.
865fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t index = nodes.size();
866fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        nodes.addElement(nodeFromWeight32(p), errorCode);
867fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode);
868fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return index;
869fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
870fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
871fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
872fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t
873fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) {
874fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return 0; }
875fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(0 <= index && index < nodes.size());
8761b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    U_ASSERT(UCOL_SECONDARY <= level && level <= UCOL_TERTIARY);
877fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
878fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(weight16 == Collation::COMMON_WEIGHT16) {
879fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return findCommonNode(index, level);
880fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
8811b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert
8821b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    // If this will be the first below-common weight for the parent node,
8831b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    // then we will also need to insert a common weight after it.
8841b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    int64_t node = nodes.elementAti(index);
8851b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    U_ASSERT(strengthFromNode(node) < level);  // parent node is stronger
8861b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    if(weight16 != 0 && weight16 < Collation::COMMON_WEIGHT16) {
8871b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        int32_t hasThisLevelBefore = level == UCOL_SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;
8881b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        if((node & hasThisLevelBefore) == 0) {
8891b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            // The parent node has an implied level-common weight.
8901b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            int64_t commonNode =
8911b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                nodeFromWeight16(Collation::COMMON_WEIGHT16) | nodeFromStrength(level);
8921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            if(level == UCOL_SECONDARY) {
8931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                // Move the HAS_BEFORE3 flag from the parent node
8941b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                // to the new secondary common node.
8951b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                commonNode |= node & HAS_BEFORE3;
8961b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                node &= ~(int64_t)HAS_BEFORE3;
8971b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            }
8981b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            nodes.setElementAt(node | hasThisLevelBefore, index);
8991b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            // Insert below-common-weight node.
9001b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            int32_t nextIndex = nextIndexFromNode(node);
9011b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            node = nodeFromWeight16(weight16) | nodeFromStrength(level);
9021b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            index = insertNodeBetween(index, nextIndex, node, errorCode);
9031b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            // Insert common-weight node.
9041b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            insertNodeBetween(index, nextIndex, commonNode, errorCode);
9051b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            // Return index of below-common-weight node.
9061b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            return index;
9071b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        }
9081b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    }
9091b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert
910fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Find the root CE's weight for this level.
911fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Postpone insertion if not found:
912fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Insert the new root node before the next stronger node,
913fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // or before the next root node with the same strength and a larger weight.
914fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t nextIndex;
915fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    while((nextIndex = nextIndexFromNode(node)) != 0) {
916fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        node = nodes.elementAti(nextIndex);
917fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t nextStrength = strengthFromNode(node);
918fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(nextStrength <= level) {
919fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // Insert before a stronger node.
920fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(nextStrength < level) { break; }
921fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // nextStrength == level
922fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(!isTailoredNode(node)) {
923fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                uint32_t nextWeight16 = weight16FromNode(node);
924fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                if(nextWeight16 == weight16) {
925fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    // Found the node for the root CE up to this level.
926fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    return nextIndex;
927fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                }
928fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                // Insert before a node with a larger same-strength weight.
929fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                if(nextWeight16 > weight16) { break; }
930fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
931fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
932fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Skip the next node.
933fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        index = nextIndex;
934fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
935fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    node = nodeFromWeight16(weight16) | nodeFromStrength(level);
936fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return insertNodeBetween(index, nextIndex, node, errorCode);
937fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
938fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
939fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t
940fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) {
941fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return 0; }
942fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(0 <= index && index < nodes.size());
943fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(strength >= UCOL_SECONDARY) {
944fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        index = findCommonNode(index, UCOL_SECONDARY);
945fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(strength >= UCOL_TERTIARY) {
946fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            index = findCommonNode(index, UCOL_TERTIARY);
947fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
948fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
949fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Postpone insertion:
950fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Insert the new node before the next one with a strength at least as strong.
951fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int64_t node = nodes.elementAti(index);
952fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t nextIndex;
953fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    while((nextIndex = nextIndexFromNode(node)) != 0) {
954fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        node = nodes.elementAti(nextIndex);
955fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(strengthFromNode(node) <= strength) { break; }
956fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Skip the next node which has a weaker (larger) strength than the new one.
957fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        index = nextIndex;
958fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
959fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    node = IS_TAILORED | nodeFromStrength(strength);
960fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return insertNodeBetween(index, nextIndex, node, errorCode);
961fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
962fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
963fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t
964fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node,
965fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    UErrorCode &errorCode) {
966fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return 0; }
967fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(previousIndexFromNode(node) == 0);
968fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(nextIndexFromNode(node) == 0);
969fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);
970fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Append the new node and link it to the existing nodes.
971fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t newIndex = nodes.size();
972fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);
973fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    nodes.addElement(node, errorCode);
974fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return 0; }
975fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // nodes[index].nextIndex = newIndex
976fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    node = nodes.elementAti(index);
977fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);
978fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // nodes[nextIndex].previousIndex = newIndex
979fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(nextIndex != 0) {
980fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        node = nodes.elementAti(nextIndex);
981fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);
982fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
983fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return newIndex;
984fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
985fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
986fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t
987fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::findCommonNode(int32_t index, int32_t strength) const {
988fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY);
989fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int64_t node = nodes.elementAti(index);
990fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(strengthFromNode(node) >= strength) {
991fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // The current node is no stronger.
992fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return index;
993fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
994fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) {
995fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // The current node implies the strength-common weight.
996fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return index;
997fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
998fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    index = nextIndexFromNode(node);
999fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    node = nodes.elementAti(index);
1000fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength &&
10011b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            weight16FromNode(node) < Collation::COMMON_WEIGHT16);
1002fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Skip to the explicit common node.
1003fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    do {
1004fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        index = nextIndexFromNode(node);
1005fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        node = nodes.elementAti(index);
1006fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        U_ASSERT(strengthFromNode(node) >= strength);
10071b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    } while(isTailoredNode(node) || strengthFromNode(node) > strength ||
10081b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert            weight16FromNode(node) < Collation::COMMON_WEIGHT16);
1009fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16);
1010fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return index;
1011fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1012fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1013fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid
1014fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::setCaseBits(const UnicodeString &nfdString,
1015fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                              const char *&parserErrorReason, UErrorCode &errorCode) {
1016fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return; }
1017fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t numTailoredPrimaries = 0;
1018fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    for(int32_t i = 0; i < cesLength; ++i) {
1019fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; }
1020fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1021fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // We should not be able to get too many case bits because
1022fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // cesLength<=31==MAX_EXPANSION_LENGTH.
1023fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // 31 pairs of case bits fit into an int64_t without setting its sign bit.
1024fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(numTailoredPrimaries <= 31);
1025fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1026fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int64_t cases = 0;
1027fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(numTailoredPrimaries > 0) {
1028fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        const UChar *s = nfdString.getBuffer();
1029fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        UTF16CollationIterator baseCEs(baseData, FALSE, s, s, s + nfdString.length());
1030fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1;
1031fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(U_FAILURE(errorCode)) {
1032fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            parserErrorReason = "fetching root CEs for tailored string";
1033fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return;
1034fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1035fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE);
1036fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1037fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        uint32_t lastCase = 0;
1038fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t numBasePrimaries = 0;
1039fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        for(int32_t i = 0; i < baseCEsLength; ++i) {
1040fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            int64_t ce = baseCEs.getCE(i);
1041fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if((ce >> 32) != 0) {
1042fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                ++numBasePrimaries;
1043fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                uint32_t c = ((uint32_t)ce >> 14) & 3;
1044fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                U_ASSERT(c == 0 || c == 2);  // lowercase or uppercase, no mixed case in any base CE
1045fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                if(numBasePrimaries < numTailoredPrimaries) {
1046fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    cases |= (int64_t)c << ((numBasePrimaries - 1) * 2);
1047fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                } else if(numBasePrimaries == numTailoredPrimaries) {
1048fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    lastCase = c;
1049fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                } else if(c != lastCase) {
1050fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    // There are more base primary CEs than tailored primaries.
1051fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    // Set mixed case if the case bits of the remainder differ.
1052fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    lastCase = 1;
1053fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    // Nothing more can change.
1054fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    break;
1055fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                }
1056fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
1057fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1058fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(numBasePrimaries >= numTailoredPrimaries) {
1059fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2);
1060fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1061fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1062fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1063fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    for(int32_t i = 0; i < cesLength; ++i) {
1064fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff);  // clear old case bits
1065fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t strength = ceStrength(ce);
1066fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(strength == UCOL_PRIMARY) {
1067fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            ce |= (cases & 3) << 14;
1068fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            cases >>= 2;
1069fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        } else if(strength == UCOL_TERTIARY) {
1070fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // Tertiary CEs must have uppercase bits.
1071fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // See the LDML spec, and comments in class CollationCompare.
1072fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            ce |= 0x8000;
1073fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1074fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Tertiary ignorable CEs must have 0 case bits.
1075fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // We set 0 case bits for secondary CEs too
1076fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // since currently only U+0345 is cased and maps to a secondary CE,
1077fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // and it is lowercase. Other secondaries are uncased.
1078fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.
1079fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ces[i] = ce;
1080fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1081fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1082fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1083fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid
1084fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason,
1085fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                       UErrorCode &errorCode) {
1086fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return; }
1087fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    dataBuilder->suppressContractions(set, errorCode);
1088fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) {
1089fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        parserErrorReason = "application of [suppressContractions [set]] failed";
1090fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1091fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1092fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1093fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid
1094fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */,
1095fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                           UErrorCode &errorCode) {
1096fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return; }
1097fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    optimizeSet.addAll(set);
1098fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1099fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1100fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t
1101fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
1102fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
1103fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                 UErrorCode &errorCode) {
1104fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Map from the NFD input to the CEs.
1105fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);
1106fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode);
1107fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    addTailComposites(nfdPrefix, nfdString, errorCode);
1108fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return ce32;
1109fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1110fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1111fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t
1112fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
1113fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
1114fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                 UErrorCode &errorCode) {
1115fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return ce32; }
1116fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1117fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)
1118fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(nfdPrefix.isEmpty()) {
1119fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        CanonicalIterator stringIter(nfdString, errorCode);
1120fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(U_FAILURE(errorCode)) { return ce32; }
1121fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        UnicodeString prefix;
1122fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        for(;;) {
1123fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            UnicodeString str = stringIter.next();
1124fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(str.isBogus()) { break; }
1125fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(ignoreString(str, errorCode) || str == nfdString) { continue; }
1126fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
1127fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(U_FAILURE(errorCode)) { return ce32; }
1128fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1129fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    } else {
1130fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        CanonicalIterator prefixIter(nfdPrefix, errorCode);
1131fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        CanonicalIterator stringIter(nfdString, errorCode);
1132fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(U_FAILURE(errorCode)) { return ce32; }
1133fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        for(;;) {
1134fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            UnicodeString prefix = prefixIter.next();
1135fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(prefix.isBogus()) { break; }
1136fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(ignorePrefix(prefix, errorCode)) { continue; }
1137fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            UBool samePrefix = prefix == nfdPrefix;
1138fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            for(;;) {
1139fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                UnicodeString str = stringIter.next();
1140fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                if(str.isBogus()) { break; }
1141fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; }
1142fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode);
1143fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                if(U_FAILURE(errorCode)) { return ce32; }
1144fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
1145fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            stringIter.reset();
1146fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1147fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1148fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return ce32;
1149fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1150fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1151fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid
1152fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString,
1153fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    UErrorCode &errorCode) {
1154fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return; }
1155fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1156fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Look for the last starter in the NFD string.
1157fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UChar32 lastStarter;
1158fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t indexAfterLastStarter = nfdString.length();
1159fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    for(;;) {
1160fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(indexAfterLastStarter == 0) { return; }  // no starter at all
1161fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        lastStarter = nfdString.char32At(indexAfterLastStarter - 1);
1162fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(nfd.getCombiningClass(lastStarter) == 0) { break; }
1163fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        indexAfterLastStarter -= U16_LENGTH(lastStarter);
1164fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1165fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // No closure to Hangul syllables since we decompose them on the fly.
1166fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(Hangul::isJamoL(lastStarter)) { return; }
1167fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1168fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Are there any composites whose decomposition starts with the lastStarter?
1169fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
1170fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // We might find some more equivalent mappings here if it did.
1171fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeSet composites;
1172fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }
1173fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1174fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeString decomp;
1175fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeString newNFDString, newString;
1176fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int64_t newCEs[Collation::MAX_EXPANSION_LENGTH];
1177fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeSetIterator iter(composites);
1178fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    while(iter.next()) {
1179fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        U_ASSERT(!iter.isString());
1180fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        UChar32 composite = iter.getCodepoint();
1181fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        nfd.getDecomposition(composite, decomp);
1182fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
1183fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                     newNFDString, newString, errorCode)) {
1184fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            continue;
1185fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1186fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0);
1187fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) {
1188fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // Ignore mappings that we cannot store.
1189fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            continue;
1190fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1191fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Note: It is possible that the newCEs do not make use of the mapping
1192fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // for which we are adding the tail composites, in which case we might be adding
1193fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // unnecessary mappings.
1194fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // For example, when we add tail composites for ae^ (^=combining circumflex),
1195fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // UCA discontiguous-contraction matching does not find any matches
1196fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // for ae_^ (_=any combining diacritic below) *unless* there is also
1197fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // a contraction mapping for ae.
1198fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Thus, if there is no ae contraction, then the ae^ mapping is ignored
1199fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // while fetching the newCEs for ae_^.
1200fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // TODO: Try to detect this effectively.
1201fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // (Alternatively, print a warning when prefix contractions are missing.)
1202fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1203fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // We do not need an explicit mapping for the NFD strings.
1204fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // It is fine if the NFD input collates like this via a sequence of mappings.
1205fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // It also saves a little bit of space, and may reduce the set of characters with contractions.
1206fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        uint32_t ce32 = addIfDifferent(nfdPrefix, newString,
1207fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                       newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode);
1208fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(ce32 != Collation::UNASSIGNED_CE32) {
1209fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // was different, was added
1210fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode);
1211fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1212fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1213fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1214fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1215fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool
1216fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString,
1217fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                           int32_t indexAfterLastStarter,
1218fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                           UChar32 composite, const UnicodeString &decomp,
1219fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                           UnicodeString &newNFDString, UnicodeString &newString,
1220fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                           UErrorCode &errorCode) const {
1221fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return FALSE; }
1222fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0));
1223fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t lastStarterLength = decomp.moveIndex32(0, 1);
1224fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(lastStarterLength == decomp.length()) {
1225fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Singleton decompositions should be found by addWithClosure()
1226fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // and the CanonicalIterator, so we can ignore them here.
1227fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return FALSE;
1228fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1229fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(nfdString.compare(indexAfterLastStarter, 0x7fffffff,
1230fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                         decomp, lastStarterLength, 0x7fffffff) == 0) {
1231fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // same strings, nothing new to be found here
1232fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return FALSE;
1233fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1234fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1235fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Make new FCD strings that combine a composite, or its decomposition,
1236fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // into the nfdString's last starter and the combining marks following it.
1237fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Make an NFD version, and a version with the composite.
1238fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    newNFDString.setTo(nfdString, 0, indexAfterLastStarter);
1239fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite);
1240fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1241fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // The following is related to discontiguous contraction matching,
1242fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // but builds only FCD strings (or else returns FALSE).
1243fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t sourceIndex = indexAfterLastStarter;
1244fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t decompIndex = lastStarterLength;
1245fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Small optimization: We keep the source character across loop iterations
1246fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // because we do not always consume it,
1247fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // and then need not fetch it again nor look up its combining class again.
1248fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UChar32 sourceChar = U_SENTINEL;
1249fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // The cc variables need to be declared before the loop so that at the end
1250fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // they are set to the last combining classes seen.
1251fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    uint8_t sourceCC = 0;
1252fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    uint8_t decompCC = 0;
1253fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    for(;;) {
1254fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(sourceChar < 0) {
1255fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(sourceIndex >= nfdString.length()) { break; }
1256fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            sourceChar = nfdString.char32At(sourceIndex);
1257fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            sourceCC = nfd.getCombiningClass(sourceChar);
1258fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            U_ASSERT(sourceCC != 0);
1259fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1260fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // We consume a decomposition character in each iteration.
1261fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(decompIndex >= decomp.length()) { break; }
1262fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        UChar32 decompChar = decomp.char32At(decompIndex);
1263fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        decompCC = nfd.getCombiningClass(decompChar);
1264fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        // Compare the two characters and their combining classes.
1265fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(decompCC == 0) {
1266fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // Unable to merge because the source contains a non-zero combining mark
1267fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // but the composite's decomposition contains another starter.
1268fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // The strings would not be equivalent.
1269fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return FALSE;
1270fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        } else if(sourceCC < decompCC) {
1271fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // Composite + sourceChar would not be FCD.
1272fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return FALSE;
1273fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        } else if(decompCC < sourceCC) {
1274fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            newNFDString.append(decompChar);
1275fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            decompIndex += U16_LENGTH(decompChar);
1276fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        } else if(decompChar != sourceChar) {
1277fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // Blocked because same combining class.
1278fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return FALSE;
1279fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        } else {  // match: decompChar == sourceChar
1280fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            newNFDString.append(decompChar);
1281fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            decompIndex += U16_LENGTH(decompChar);
1282fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            sourceIndex += U16_LENGTH(decompChar);
1283fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            sourceChar = U_SENTINEL;
1284fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1285fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1286fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // We are at the end of at least one of the two inputs.
1287fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(sourceChar >= 0) {  // more characters from nfdString but not from decomp
1288fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(sourceCC < decompCC) {
1289fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // Appending the next source character to the composite would not be FCD.
1290fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return FALSE;
1291fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1292fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        newNFDString.append(nfdString, sourceIndex, 0x7fffffff);
1293fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        newString.append(nfdString, sourceIndex, 0x7fffffff);
1294fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    } else if(decompIndex < decomp.length()) {  // more characters from decomp, not from nfdString
1295fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        newNFDString.append(decomp, decompIndex, 0x7fffffff);
1296fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1297fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(nfd.isNormalized(newNFDString, errorCode));
1298fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(fcd.isNormalized(newString, errorCode));
1299fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString);  // canonically equivalent
1300fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return TRUE;
1301fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1302fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1303fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool
1304fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const {
1305fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Do not map non-FCD prefixes.
1306fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return !isFCD(s, errorCode);
1307fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1308fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1309fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool
1310fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const {
1311fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Do not map non-FCD strings.
1312fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Do not map strings that start with Hangul syllables: We decompose those on the fly.
1313fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0));
1314fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1315fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1316fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool
1317fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const {
1318fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode);
1319fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1320fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1321fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid
1322fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::closeOverComposites(UErrorCode &errorCode) {
1323fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode);  // Java: static final
1324fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return; }
1325fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Hangul is decomposed on the fly during collation.
1326fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END);
1327fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeString prefix;  // empty
1328fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeString nfdString;
1329fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeSetIterator iter(composites);
1330fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    while(iter.next()) {
1331fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        U_ASSERT(!iter.isString());
1332fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        nfd.getDecomposition(iter.getCodepoint(), nfdString);
1333fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        cesLength = dataBuilder->getCEs(nfdString, ces, 0);
1334fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(cesLength > Collation::MAX_EXPANSION_LENGTH) {
1335fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // Too many CEs from the decomposition (unusual), ignore this composite.
1336fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // We could add a capacity parameter to getCEs() and reallocate if necessary.
1337fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // However, this can only really happen in contrived cases.
1338fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            continue;
1339fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1340fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        const UnicodeString &composite(iter.getString());
1341fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode);
1342fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1343fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1344fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1345fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t
1346fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str,
1347fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32,
1348fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                 UErrorCode &errorCode) {
1349fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return ce32; }
1350fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH];
1351fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0);
1352fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) {
1353fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(ce32 == Collation::UNASSIGNED_CE32) {
1354fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode);
1355fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1356fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        dataBuilder->addCE32(prefix, str, ce32, errorCode);
1357fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1358fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return ce32;
1359fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1360fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1361fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusUBool
1362fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length,
1363fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                          const int64_t ces2[], int32_t ces2Length) {
1364fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(ces1Length != ces2Length) {
1365fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return FALSE;
1366fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1367fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH);
1368fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    for(int32_t i = 0; i < ces1Length; ++i) {
1369fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(ces1[i] != ces2[i]) { return FALSE; }
1370fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1371fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return TRUE;
1372fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1373fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1374fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER
1375fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1376fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusuint32_t
1377fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusalignWeightRight(uint32_t w) {
1378fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(w != 0) {
1379fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        while((w & 0xff) == 0) { w >>= 8; }
1380fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1381fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return w;
1382fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1383fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1384fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif
1385fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1386fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid
1387fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::makeTailoredCEs(UErrorCode &errorCode) {
1388fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return; }
1389fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1390fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    CollationWeights primaries, secondaries, tertiaries;
1391fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int64_t *nodesArray = nodes.getBuffer();
13921b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#ifdef DEBUG_COLLATION_BUILDER
13931b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert        puts("\nCollationBuilder::makeTailoredCEs()");
13941b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#endif
1395fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1396fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
1397fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t i = rootPrimaryIndexes.elementAti(rpi);
1398fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int64_t node = nodesArray[i];
1399fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        uint32_t p = weight32FromNode(node);
1400fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16;
1401fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        uint32_t t = s;
1402fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        uint32_t q = 0;
1403fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        UBool pIsTailored = FALSE;
1404fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        UBool sIsTailored = FALSE;
1405fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        UBool tIsTailored = FALSE;
1406fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER
1407fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        printf("\nprimary     %lx\n", (long)alignWeightRight(p));
1408fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif
1409fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p);
1410fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int32_t nextIndex = nextIndexFromNode(node);
1411fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        while(nextIndex != 0) {
1412fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            i = nextIndex;
1413fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            node = nodesArray[i];
1414fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            nextIndex = nextIndexFromNode(node);
1415fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            int32_t strength = strengthFromNode(node);
1416fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(strength == UCOL_QUATERNARY) {
1417fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                U_ASSERT(isTailoredNode(node));
1418fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER
1419fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                printf("      quat+     ");
1420fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif
1421fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                if(q == 3) {
1422fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    errorCode = U_BUFFER_OVERFLOW_ERROR;
1423fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    errorReason = "quaternary tailoring gap too small";
1424fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    return;
1425fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                }
1426fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                ++q;
1427fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            } else {
1428fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                if(strength == UCOL_TERTIARY) {
1429fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    if(isTailoredNode(node)) {
1430fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER
1431fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        printf("    ter+        ");
1432fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif
1433fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        if(!tIsTailored) {
1434fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            // First tailored tertiary node for [p, s].
1435fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            int32_t tCount = countTailoredNodes(nodesArray, nextIndex,
1436fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                                                UCOL_TERTIARY) + 1;
1437fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            uint32_t tLimit;
1438fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            if(t == 0) {
1439fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                // Gap at the beginning of the tertiary CE range.
1440fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                t = rootElements.getTertiaryBoundary() - 0x100;
1441fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK;
1442fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            } else if(!pIsTailored && !sIsTailored) {
1443fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                // p and s are root weights.
1444fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
14451b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                            } else if(t == Collation::BEFORE_WEIGHT16) {
14461b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                                tLimit = Collation::COMMON_WEIGHT16;
1447fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            } else {
1448fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                // [p, s] is tailored.
1449fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                U_ASSERT(t == Collation::COMMON_WEIGHT16);
1450fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                tLimit = rootElements.getTertiaryBoundary();
1451fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            }
1452fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0);
1453fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            tertiaries.initForTertiary();
1454fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            if(!tertiaries.allocWeights(t, tLimit, tCount)) {
1455fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                errorCode = U_BUFFER_OVERFLOW_ERROR;
1456fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                errorReason = "tertiary tailoring gap too small";
1457fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                return;
1458fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            }
1459fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            tIsTailored = TRUE;
1460fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        }
1461fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        t = tertiaries.nextWeight();
1462fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        U_ASSERT(t != 0xffffffff);
1463fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    } else {
1464fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        t = weight16FromNode(node);
1465fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        tIsTailored = FALSE;
1466fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER
1467fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        printf("    ter     %lx\n", (long)alignWeightRight(t));
1468fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif
1469fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    }
1470fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                } else {
1471fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    if(strength == UCOL_SECONDARY) {
1472fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        if(isTailoredNode(node)) {
1473fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER
1474fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            printf("  sec+          ");
1475fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif
1476fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            if(!sIsTailored) {
1477fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                // First tailored secondary node for p.
1478fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                int32_t sCount = countTailoredNodes(nodesArray, nextIndex,
1479fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                                                    UCOL_SECONDARY) + 1;
1480fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                uint32_t sLimit;
1481fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                if(s == 0) {
1482fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    // Gap at the beginning of the secondary CE range.
1483fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    s = rootElements.getSecondaryBoundary() - 0x100;
1484fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    sLimit = rootElements.getFirstSecondaryCE() >> 16;
1485fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                } else if(!pIsTailored) {
1486fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    // p is a root primary.
1487fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    sLimit = rootElements.getSecondaryAfter(pIndex, s);
14881b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                                } else if(s == Collation::BEFORE_WEIGHT16) {
14891b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                                    sLimit = Collation::COMMON_WEIGHT16;
1490fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                } else {
1491fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    // p is a tailored primary.
1492fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    U_ASSERT(s == Collation::COMMON_WEIGHT16);
1493fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    sLimit = rootElements.getSecondaryBoundary();
1494fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                }
1495fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                if(s == Collation::COMMON_WEIGHT16) {
1496fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    // Do not tailor into the getSortKey() range of
1497fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    // compressed common secondaries.
1498fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    s = rootElements.getLastCommonSecondary();
1499fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                }
1500fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                secondaries.initForSecondary();
1501fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                if(!secondaries.allocWeights(s, sLimit, sCount)) {
1502fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    errorCode = U_BUFFER_OVERFLOW_ERROR;
1503fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    errorReason = "secondary tailoring gap too small";
15041b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#ifdef DEBUG_COLLATION_BUILDER
15051b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                                    printf("!secondaries.allocWeights(%lx, %lx, sCount=%ld)\n",
15061b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                                           (long)alignWeightRight(s), (long)alignWeightRight(sLimit),
15071b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert                                           (long)alignWeightRight(sCount));
15081b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert#endif
1509fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    return;
1510fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                }
1511fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                sIsTailored = TRUE;
1512fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            }
1513fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            s = secondaries.nextWeight();
1514fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            U_ASSERT(s != 0xffffffff);
1515fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        } else {
1516fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            s = weight16FromNode(node);
1517fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            sIsTailored = FALSE;
1518fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER
1519fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            printf("  sec       %lx\n", (long)alignWeightRight(s));
1520fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif
1521fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        }
1522fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    } else /* UCOL_PRIMARY */ {
1523fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        U_ASSERT(isTailoredNode(node));
1524fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER
1525fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        printf("pri+            ");
1526fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif
1527fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        if(!pIsTailored) {
1528fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            // First tailored primary node in this list.
1529fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            int32_t pCount = countTailoredNodes(nodesArray, nextIndex,
1530fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                                                UCOL_PRIMARY) + 1;
1531fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            UBool isCompressible = baseData->isCompressiblePrimary(p);
1532fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            uint32_t pLimit =
1533fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                rootElements.getPrimaryAfter(p, pIndex, isCompressible);
1534fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            primaries.initForPrimary(isCompressible);
1535fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            if(!primaries.allocWeights(p, pLimit, pCount)) {
1536fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                errorCode = U_BUFFER_OVERFLOW_ERROR;  // TODO: introduce a more specific UErrorCode?
1537fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                errorReason = "primary tailoring gap too small";
1538fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                return;
1539fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            }
1540fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                            pIsTailored = TRUE;
1541fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        }
1542fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        p = primaries.nextWeight();
1543fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        U_ASSERT(p != 0xffffffff);
1544fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        s = Collation::COMMON_WEIGHT16;
1545fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                        sIsTailored = FALSE;
1546fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    }
1547fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    t = s == 0 ? 0 : Collation::COMMON_WEIGHT16;
1548fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    tIsTailored = FALSE;
1549fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                }
1550fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                q = 0;
1551fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
1552fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(isTailoredNode(node)) {
1553fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                nodesArray[i] = Collation::makeCE(p, s, t, q);
1554fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifdef DEBUG_COLLATION_BUILDER
1555fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                printf("%016llx\n", (long long)nodesArray[i]);
1556fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif
1557fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
1558fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1559fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1560fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1561fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1562fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t
1563fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) {
1564fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t count = 0;
1565fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    for(;;) {
1566fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(i == 0) { break; }
1567fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        int64_t node = nodesArray[i];
1568fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(strengthFromNode(node) < strength) { break; }
1569fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(strengthFromNode(node) == strength) {
1570fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            if(isTailoredNode(node)) {
1571fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                ++count;
1572fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            } else {
1573fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                break;
1574fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
1575fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1576fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        i = nextIndexFromNode(node);
1577fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1578fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return count;
1579fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1580fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1581fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass CEFinalizer : public CollationDataBuilder::CEModifier {
1582fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic:
1583fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    CEFinalizer(const int64_t *ces) : finalCEs(ces) {}
1584fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    virtual ~CEFinalizer();
1585fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    virtual int64_t modifyCE32(uint32_t ce32) const {
1586fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        U_ASSERT(!Collation::isSpecialCE32(ce32));
1587fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(CollationBuilder::isTempCE32(ce32)) {
1588fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // retain case bits
1589fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);
1590fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        } else {
1591fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return Collation::NO_CE;
1592fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1593fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1594fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    virtual int64_t modifyCE(int64_t ce) const {
1595fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(CollationBuilder::isTempCE(ce)) {
1596fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            // retain case bits
1597fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000);
1598fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        } else {
1599fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            return Collation::NO_CE;
1600fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1601fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1602fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1603fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusprivate:
1604fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    const int64_t *finalCEs;
1605fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius};
1606fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1607fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCEFinalizer::~CEFinalizer() {}
1608fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1609fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusvoid
1610fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::finalizeCEs(UErrorCode &errorCode) {
1611fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return; }
16121b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode), errorCode);
16131b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert    if(U_FAILURE(errorCode)) {
1614fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return;
1615fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1616fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    newBuilder->initForTailoring(baseData, errorCode);
1617fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    CEFinalizer finalizer(nodes.getBuffer());
1618fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    newBuilder->copyFrom(*dataBuilder, finalizer, errorCode);
1619fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(errorCode)) { return; }
1620fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    delete dataBuilder;
1621fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    dataBuilder = newBuilder.orphan();
1622fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1623fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1624fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusint32_t
1625fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusCollationBuilder::ceStrength(int64_t ce) {
1626fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return
1627fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        isTempCE(ce) ? strengthFromTempCE(ce) :
1628fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY :
1629fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ((uint32_t)ce & 0xff000000) != 0 ? UCOL_SECONDARY :
1630fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        ce != 0 ? UCOL_TERTIARY :
1631fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        UCOL_IDENTICAL;
1632fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1633fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1634fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_END
1635fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1636fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_USE
1637fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1638fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_CAPI UCollator * U_EXPORT2
1639fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusucol_openRules(const UChar *rules, int32_t rulesLength,
1640fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius               UColAttributeValue normalizationMode, UCollationStrength strength,
1641fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius               UParseError *parseError, UErrorCode *pErrorCode) {
1642fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(*pErrorCode)) { return NULL; }
1643fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(rules == NULL && rulesLength != 0) {
1644fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
1645fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return NULL;
1646fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1647fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    RuleBasedCollator *coll = new RuleBasedCollator();
1648fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(coll == NULL) {
1649fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
1650fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return NULL;
1651fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1652fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeString r((UBool)(rulesLength < 0), rules, rulesLength);
1653fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    coll->internalBuildTailoring(r, strength, normalizationMode, parseError, NULL, *pErrorCode);
1654fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    if(U_FAILURE(*pErrorCode)) {
1655fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        delete coll;
1656fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        return NULL;
1657fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1658fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return coll->toUCollator();
1659fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1660fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1661fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstatic const int32_t internalBufferSize = 512;
1662fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1663fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp
1664fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// because it calls UnicodeSet "builder" code that depends on all Unicode properties,
1665fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// and the rest of the collation "runtime" code only depends on normalization.
1666fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// This function is not related to the collation builder,
1667fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// but it did not seem worth moving it into its own .cpp file,
1668fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius// nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods.
1669fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_CAPI int32_t U_EXPORT2
1670fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusucol_getUnsafeSet( const UCollator *coll,
1671fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                  USet *unsafe,
1672fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                  UErrorCode *status)
1673fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius{
1674fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UChar buffer[internalBufferSize];
1675fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t len = 0;
1676fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1677fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    uset_clear(unsafe);
1678fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1679fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant
1680fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d,
1681fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                                    0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 };
1682fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1683fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // add chars that fail the fcd check
1684fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status);
1685fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1686fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // add lead/trail surrogates
1687fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*,
1688fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // not when testing code *points*)
1689fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    uset_addRange(unsafe, 0xd800, 0xdfff);
1690fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1691fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    USet *contractions = uset_open(0,0);
1692fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1693fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t i = 0, j = 0;
1694fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
1695fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int32_t contsSize = uset_size(contractions);
1696fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UChar32 c = 0;
1697fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // Contraction set consists only of strings
1698fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // to get unsafe code points, we need to
1699fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // break the strings apart and add them to the unsafe set
1700fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    for(i = 0; i < contsSize; i++) {
1701fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status);
1702fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        if(len > 0) {
1703fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            j = 0;
1704fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            while(j < len) {
1705fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                U16_NEXT(buffer, j, len, c);
1706fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                if(j < len) {
1707fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                    uset_add(unsafe, c);
1708fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius                }
1709fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            }
1710fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        }
1711fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
1712fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1713fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    uset_close(contractions);
1714fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1715fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    return uset_size(unsafe);
1716fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}
1717fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
1718fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif  // !UCONFIG_NO_COLLATION
1719