/*
*******************************************************************************
* Copyright (C) 2010-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collation.h
*
* created on: 2010oct27
* created by: Markus W. Scherer
*/

#ifndef __COLLATION_H__
#define __COLLATION_H__

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

U_NAMESPACE_BEGIN

/**
 * Collation v2 basic definitions and static helper functions.
 *
 * Data structures except for expansion tables store 32-bit CEs which are
 * either specials (see tags below) or are compact forms of 64-bit CEs.
26fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 27fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass U_I18N_API Collation { 28fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic: 29fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // Special sort key bytes for all levels. 30fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint8_t TERMINATOR_BYTE = 0; 31fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint8_t LEVEL_SEPARATOR_BYTE = 1; 321b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 331b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert /** The secondary/tertiary lower limit for tailoring before any root elements. */ 341b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert static const uint32_t BEFORE_WEIGHT16 = 0x0100; 351b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert 36fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 37fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Merge-sort-key separator. 381b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert * Same as the unique primary and identical-level weights of U+FFFE. 391b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert * Must not be used as primary compression low terminator. 40fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Otherwise usable. 41fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 42fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint8_t MERGE_SEPARATOR_BYTE = 2; 43fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t MERGE_SEPARATOR_PRIMARY = 0x02000000; // U+FFFE 441b7d32f919554dda9c193b32188251337bc756f1Fredrik Roubert static const uint32_t MERGE_SEPARATOR_CE32 = 0x02000505; // U+FFFE 45fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 46fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 47fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Primary compression low terminator, must be greater than MERGE_SEPARATOR_BYTE. 
48fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Reserved value in primary second byte if the lead byte is compressible. 49fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Otherwise usable in all CE weight bytes. 50fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 51fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint8_t PRIMARY_COMPRESSION_LOW_BYTE = 3; 52fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 53fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Primary compression high terminator. 54fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Reserved value in primary second byte if the lead byte is compressible. 55fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Otherwise usable in all CE weight bytes. 56fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 57fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint8_t PRIMARY_COMPRESSION_HIGH_BYTE = 0xff; 58fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 59fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Default secondary/tertiary weight lead byte. */ 60fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint8_t COMMON_BYTE = 5; 61fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t COMMON_WEIGHT16 = 0x0500; 62fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Middle 16 bits of a CE with a common secondary weight. */ 63fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t COMMON_SECONDARY_CE = 0x05000000; 64fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Lower 16 bits of a CE with a common tertiary weight. */ 65fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t COMMON_TERTIARY_CE = 0x0500; 66fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Lower 32 bits of a CE with common secondary and tertiary weights. 
*/ 67fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t COMMON_SEC_AND_TER_CE = 0x05000500; 68fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 69fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t SECONDARY_MASK = 0xffff0000; 70fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t CASE_MASK = 0xc000; 71fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t SECONDARY_AND_CASE_MASK = SECONDARY_MASK | CASE_MASK; 72fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Only the 2*6 bits for the pure tertiary weight. */ 73fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t ONLY_TERTIARY_MASK = 0x3f3f; 74fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Only the secondary & tertiary bits; no case, no quaternary. */ 75fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t ONLY_SEC_TER_MASK = SECONDARY_MASK | ONLY_TERTIARY_MASK; 76fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Case bits and tertiary bits. */ 77fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t CASE_AND_TERTIARY_MASK = CASE_MASK | ONLY_TERTIARY_MASK; 78fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t QUATERNARY_MASK = 0xc0; 79fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Case bits and quaternary bits. */ 80fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t CASE_AND_QUATERNARY_MASK = CASE_MASK | QUATERNARY_MASK; 81fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 82fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint8_t UNASSIGNED_IMPLICIT_BYTE = 0xfe; // compressible 83fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 84fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * First unassigned: AlphabeticIndex overflow boundary. 85fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * We want a 3-byte primary so that it fits into the root elements table. 
86fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 87fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * This 3-byte primary will not collide with 88fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * any unassigned-implicit 4-byte primaries because 89fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * the first few hundred Unicode code points all have real mappings. 90fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 91fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t FIRST_UNASSIGNED_PRIMARY = 0xfe040200; 92fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 93fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint8_t TRAIL_WEIGHT_BYTE = 0xff; // not compressible 94fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t FIRST_TRAILING_PRIMARY = 0xff020200; // [first trailing] 95fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t MAX_PRIMARY = 0xffff0000; // U+FFFF 96fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t MAX_REGULAR_CE32 = 0xffff0505; // U+FFFF 97fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 98fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // CE32 value for U+FFFD as well as illegal UTF-8 byte sequences (which behave like U+FFFD). 99fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // We use the third-highest primary weight for U+FFFD (as in UCA 6.3+). 100fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t FFFD_PRIMARY = MAX_PRIMARY - 0x20000; 101fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t FFFD_CE32 = MAX_REGULAR_CE32 - 0x20000; 102fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 103fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 104fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * A CE32 is special if its low byte is this or greater. 105fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Impossible case bits 11 mark special CE32s. 
106fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * This value itself is used to indicate a fallback to the base collator. 107fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 108fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint8_t SPECIAL_CE32_LOW_BYTE = 0xc0; 109fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t FALLBACK_CE32 = SPECIAL_CE32_LOW_BYTE; 110fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 111fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Low byte of a long-primary special CE32. 112fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 113fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint8_t LONG_PRIMARY_CE32_LOW_BYTE = 0xc1; // SPECIAL_CE32_LOW_BYTE | LONG_PRIMARY_TAG 114fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 115fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t UNASSIGNED_CE32 = 0xffffffff; // Compute an unassigned-implicit CE. 116fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 117fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t NO_CE32 = 1; 118fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 119fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** No CE: End of input. Only used in runtime code, not stored in data. */ 120fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t NO_CE_PRIMARY = 1; // not a left-adjusted weight 121fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t NO_CE_WEIGHT16 = 0x0100; // weight of LEVEL_SEPARATOR_BYTE 122fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const int64_t NO_CE = INT64_C(0x101000100); // NO_CE_PRIMARY, NO_CE_WEIGHT16, NO_CE_WEIGHT16 123fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 124fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Sort key levels. */ 125fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius enum Level { 126fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Unspecified level. 
*/ 127fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius NO_LEVEL, 128fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius PRIMARY_LEVEL, 129fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius SECONDARY_LEVEL, 130fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CASE_LEVEL, 131fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius TERTIARY_LEVEL, 132fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius QUATERNARY_LEVEL, 133fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius IDENTICAL_LEVEL, 134fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Beyond sort key bytes. */ 135fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ZERO_LEVEL 136fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius }; 137fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 138fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 139fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Sort key level flags: xx_FLAG = 1 << xx_LEVEL. 140fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * In Java, use enum Level with flag() getters, or use EnumSet rather than hand-made bit sets. 
141fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 142fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t NO_LEVEL_FLAG = 1; 143fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t PRIMARY_LEVEL_FLAG = 2; 144fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t SECONDARY_LEVEL_FLAG = 4; 145fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t CASE_LEVEL_FLAG = 8; 146fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t TERTIARY_LEVEL_FLAG = 0x10; 147fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t QUATERNARY_LEVEL_FLAG = 0x20; 148fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t IDENTICAL_LEVEL_FLAG = 0x40; 149fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t ZERO_LEVEL_FLAG = 0x80; 150fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 151fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 152fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Special-CE32 tags, from bits 3..0 of a special 32-bit CE. 153fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..8 are available for tag-specific data. 154fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 5..4: Reserved. May be used in the future to indicate lccc!=0 and tccc!=0. 155fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 156fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius enum { 157fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 158fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Fall back to the base collator. 159fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * This is the tag value in SPECIAL_CE32_LOW_BYTE and FALLBACK_CE32. 160fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..8: Unused, 0. 
161fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 162fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius FALLBACK_TAG = 0, 163fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 164fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Long-primary CE with COMMON_SEC_AND_TER_CE. 165fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..8: Three-byte primary. 166fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 167fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LONG_PRIMARY_TAG = 1, 168fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 169fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Long-secondary CE with zero primary. 170fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..16: Secondary weight. 171fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 15.. 8: Tertiary weight. 172fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 173fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LONG_SECONDARY_TAG = 2, 174fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 175fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Unused. 176fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * May be used in the future for single-byte secondary CEs (SHORT_SECONDARY_TAG), 177fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * storing the secondary in bits 31..24, the ccc in bits 23..16, 178fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * and the tertiary in bits 15..8. 179fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 180fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius RESERVED_TAG_3 = 3, 181fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 182fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Latin mini expansions of two simple CEs [pp, 05, tt] [00, ss, 05]. 183fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..24: Single-byte primary weight pp of the first CE. 184fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 23..16: Tertiary weight tt of the first CE. 
185fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 15.. 8: Secondary weight ss of the second CE. 186fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 187fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LATIN_EXPANSION_TAG = 4, 188fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 189fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Points to one or more simple/long-primary/long-secondary 32-bit CE32s. 190fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..13: Index into uint32_t table. 191fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 12.. 8: Length=1..31. 192fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 193fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius EXPANSION32_TAG = 5, 194fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 195fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Points to one or more 64-bit CEs. 196fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..13: Index into CE table. 197fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 12.. 8: Length=1..31. 198fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 199fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius EXPANSION_TAG = 6, 200fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 201fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Builder data, used only in the CollationDataBuilder, not in runtime data. 202fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 203fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * If bit 8 is 0: Builder context, points to a list of context-sensitive mappings. 204fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..13: Index to the builder's list of ConditionalCE32 for this character. 205fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 12.. 9: Unused, 0. 206fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 207fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * If bit 8 is 1 (IS_BUILDER_JAMO_CE32): Builder-only jamoCE32 value. 
208fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * The builder fetches the Jamo CE32 from the trie. 209fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..13: Jamo code point. 210fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 12.. 9: Unused, 0. 211fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 212fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius BUILDER_DATA_TAG = 7, 213fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 214fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Points to prefix trie. 215fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..13: Index into prefix/contraction data. 216fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 12.. 8: Unused, 0. 217fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 218fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius PREFIX_TAG = 8, 219fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 220fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Points to contraction data. 221fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..13: Index into prefix/contraction data. 222fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 12..11: Unused, 0. 223fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bit 10: CONTRACT_TRAILING_CCC flag. 224fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bit 9: CONTRACT_NEXT_CCC flag. 225fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bit 8: CONTRACT_SINGLE_CP_NO_MATCH flag. 226fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 227fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CONTRACTION_TAG = 9, 228fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 229fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Decimal digit. 230fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..13: Index into uint32_t table for non-numeric-collation CE32. 231fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bit 12: Unused, 0. 232fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 11.. 
8: Digit value 0..9. 233fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 234fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius DIGIT_TAG = 10, 235fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 236fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Tag for U+0000, for moving the NUL-termination handling 237fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * from the regular fastpath into specials-handling code. 238fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..8: Unused, 0. 239fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 240fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius U0000_TAG = 11, 241fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 242fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Tag for a Hangul syllable. 243fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..9: Unused, 0. 244fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bit 8: HANGUL_NO_SPECIAL_JAMO flag. 245fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 246fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius HANGUL_TAG = 12, 247fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 248fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Tag for a lead surrogate code unit. 249fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Optional optimization for UTF-16 string processing. 250fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..10: Unused, 0. 251fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 9.. 8: =0: All associated supplementary code points are unassigned-implict. 252fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * =1: All associated supplementary code points fall back to the base data. 253fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * else: (Normally 2) Look up the data for the supplementary code point. 
254fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 255fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius LEAD_SURROGATE_TAG = 13, 256fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 257fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Tag for CEs with primary weights in code point order. 258fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 31..13: Index into CE table, for one data "CE". 259fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 12.. 8: Unused, 0. 260fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 261fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * This data "CE" has the following bit fields: 262fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Bits 63..32: Three-byte primary pppppp00. 263fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 31.. 8: Start/base code point of the in-order range. 264fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 7: Flag isCompressible primary. 265fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 6.. 0: Per-code point primary-weight increment. 266fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 267fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius OFFSET_TAG = 14, 268fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 269fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Implicit CE tag. Compute an unassigned-implicit CE. 270fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * All bits are set (UNASSIGNED_CE32=0xffffffff). 
271fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 272fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius IMPLICIT_TAG = 15 273fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius }; 274fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 275fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static UBool isAssignedCE32(uint32_t ce32) { 276fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce32 != FALLBACK_CE32 && ce32 != UNASSIGNED_CE32; 277fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 278fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 279fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 280fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * We limit the number of CEs in an expansion 281fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * so that we can use a small number of length bits in the data structure, 282fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * and so that an implementation can copy CEs at runtime without growing a destination buffer. 283fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 284fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const int32_t MAX_EXPANSION_LENGTH = 31; 285fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const int32_t MAX_INDEX = 0x7ffff; 286fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 287fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 288fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Set if there is no match for the single (no-suffix) character itself. 289fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * This is only possible if there is a prefix. 290fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * In this case, discontiguous contraction matching cannot add combining marks 291fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * starting from an empty suffix. 292fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * The default CE32 is used anyway if there is no suffix match. 
293fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 294fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t CONTRACT_SINGLE_CP_NO_MATCH = 0x100; 295fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Set if the first character of every contraction suffix has lccc!=0. */ 296fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t CONTRACT_NEXT_CCC = 0x200; 297fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Set if any contraction suffix ends with lccc!=0. */ 298fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t CONTRACT_TRAILING_CCC = 0x400; 299fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 300fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** For HANGUL_TAG: None of its Jamo CE32s isSpecialCE32(). */ 301fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t HANGUL_NO_SPECIAL_JAMO = 0x100; 302fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 303fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t LEAD_ALL_UNASSIGNED = 0; 304fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t LEAD_ALL_FALLBACK = 0x100; 305fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t LEAD_MIXED = 0x200; 306fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static const uint32_t LEAD_TYPE_MASK = 0x300; 307fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 308fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static uint32_t makeLongPrimaryCE32(uint32_t p) { return p | LONG_PRIMARY_CE32_LOW_BYTE; } 309fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 310fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Turns the long-primary CE32 into a primary weight pppppp00. 
*/ 311fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline uint32_t primaryFromLongPrimaryCE32(uint32_t ce32) { 312fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce32 & 0xffffff00; 313fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 314fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline int64_t ceFromLongPrimaryCE32(uint32_t ce32) { 315fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ((int64_t)(ce32 & 0xffffff00) << 32) | COMMON_SEC_AND_TER_CE; 316fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 317fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 318fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static uint32_t makeLongSecondaryCE32(uint32_t lower32) { 319fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return lower32 | SPECIAL_CE32_LOW_BYTE | LONG_SECONDARY_TAG; 320fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 321fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline int64_t ceFromLongSecondaryCE32(uint32_t ce32) { 322fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ce32 & 0xffffff00; 323fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 324fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 325fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Makes a special CE32 with tag, index and length. */ 326fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static uint32_t makeCE32FromTagIndexAndLength(int32_t tag, int32_t index, int32_t length) { 327fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return (index << 13) | (length << 8) | SPECIAL_CE32_LOW_BYTE | tag; 328fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 329fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Makes a special CE32 with only tag and index. 
*/ 330fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static uint32_t makeCE32FromTagAndIndex(int32_t tag, int32_t index) { 331fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return (index << 13) | SPECIAL_CE32_LOW_BYTE | tag; 332fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 333fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 334fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline UBool isSpecialCE32(uint32_t ce32) { 335fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return (ce32 & 0xff) >= SPECIAL_CE32_LOW_BYTE; 336fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 337fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 338fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline int32_t tagFromCE32(uint32_t ce32) { 339fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return (int32_t)(ce32 & 0xf); 340fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 341fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 342fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline UBool hasCE32Tag(uint32_t ce32, int32_t tag) { 343fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return isSpecialCE32(ce32) && tagFromCE32(ce32) == tag; 344fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 345fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 346fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline UBool isLongPrimaryCE32(uint32_t ce32) { 347fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return hasCE32Tag(ce32, LONG_PRIMARY_TAG); 348fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 349fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 350fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static UBool isSimpleOrLongCE32(uint32_t ce32) { 351fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return !isSpecialCE32(ce32) || 352fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tagFromCE32(ce32) == LONG_PRIMARY_TAG || 353fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tagFromCE32(ce32) == LONG_SECONDARY_TAG; 
354fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 355fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 356fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 357fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * @return TRUE if the ce32 yields one or more CEs without further data lookups 358fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 359fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static UBool isSelfContainedCE32(uint32_t ce32) { 360fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return !isSpecialCE32(ce32) || 361fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tagFromCE32(ce32) == LONG_PRIMARY_TAG || 362fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tagFromCE32(ce32) == LONG_SECONDARY_TAG || 363fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tagFromCE32(ce32) == LATIN_EXPANSION_TAG; 364fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 365fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 366fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline UBool isPrefixCE32(uint32_t ce32) { 367fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return hasCE32Tag(ce32, PREFIX_TAG); 368fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 369fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 370fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline UBool isContractionCE32(uint32_t ce32) { 371fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return hasCE32Tag(ce32, CONTRACTION_TAG); 372fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 373fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 374fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline UBool ce32HasContext(uint32_t ce32) { 375fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return isSpecialCE32(ce32) && 376fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius (tagFromCE32(ce32) == PREFIX_TAG || 377fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tagFromCE32(ce32) == CONTRACTION_TAG); 378fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 
379fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 380fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 381fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Get the first of the two Latin-expansion CEs encoded in ce32. 382fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * @see LATIN_EXPANSION_TAG 383fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 384fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline int64_t latinCE0FromCE32(uint32_t ce32) { 385fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ((int64_t)(ce32 & 0xff000000) << 32) | COMMON_SECONDARY_CE | ((ce32 & 0xff0000) >> 8); 386fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 387fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 388fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 389fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Get the second of the two Latin-expansion CEs encoded in ce32. 390fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * @see LATIN_EXPANSION_TAG 391fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 392fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline int64_t latinCE1FromCE32(uint32_t ce32) { 393fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ((ce32 & 0xff00) << 16) | COMMON_TERTIARY_CE; 394fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 395fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 396fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 397fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Returns the data index from a special CE32. 
398fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 399fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline int32_t indexFromCE32(uint32_t ce32) { 400fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return (int32_t)(ce32 >> 13); 401fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 402fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 403fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 404fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Returns the data length from a ce32. 405fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 406fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline int32_t lengthFromCE32(uint32_t ce32) { 407fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return (ce32 >> 8) & 31; 408fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 409fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 410fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 411fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Returns the digit value from a DIGIT_TAG ce32. 412fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 413fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline char digitFromCE32(uint32_t ce32) { 414fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return (char)((ce32 >> 8) & 0xf); 415fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 416fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 417fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Returns a 64-bit CE from a simple CE32 (not special). 
*/ 418fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline int64_t ceFromSimpleCE32(uint32_t ce32) { 419fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // normal form ppppsstt -> pppp0000ss00tt00 420fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // assert (ce32 & 0xff) < SPECIAL_CE32_LOW_BYTE 421fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ((int64_t)(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | ((ce32 & 0xff) << 8); 422fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 423fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 424fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Returns a 64-bit CE from a simple/long-primary/long-secondary CE32. */ 425fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline int64_t ceFromCE32(uint32_t ce32) { 426fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius uint32_t tertiary = ce32 & 0xff; 427fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if(tertiary < SPECIAL_CE32_LOW_BYTE) { 428fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // normal form ppppsstt -> pppp0000ss00tt00 429fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ((int64_t)(ce32 & 0xffff0000) << 32) | ((ce32 & 0xff00) << 16) | (tertiary << 8); 430fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 431fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ce32 -= tertiary; 432fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius if((tertiary & 0xf) == LONG_PRIMARY_TAG) { 433fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // long-primary form ppppppC1 -> pppppp00050000500 434fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ((int64_t)ce32 << 32) | COMMON_SEC_AND_TER_CE; 435fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } else { 436fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // long-secondary form ssssttC2 -> 00000000sssstt00 437fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // assert (tertiary & 0xf) == LONG_SECONDARY_TAG 438fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 
return ce32; 439fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 440fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 441fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 442fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 443fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Creates a CE from a primary weight. */ 444fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline int64_t makeCE(uint32_t p) { 445fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ((int64_t)p << 32) | COMMON_SEC_AND_TER_CE; 446fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 447fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 448fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Creates a CE from a primary weight, 449fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 16-bit secondary/tertiary weights, and a 2-bit quaternary. 450fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 451fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline int64_t makeCE(uint32_t p, uint32_t s, uint32_t t, uint32_t q) { 452fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return ((int64_t)p << 32) | (s << 16) | t | (q << 6); 453fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 454fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 455fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 456fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Increments a 2-byte primary by a code point offset. 457fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 458fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static uint32_t incTwoBytePrimaryByOffset(uint32_t basePrimary, UBool isCompressible, 459fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t offset); 460fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 461fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 462fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Increments a 3-byte primary by a code point offset. 
463fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 464fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static uint32_t incThreeBytePrimaryByOffset(uint32_t basePrimary, UBool isCompressible, 465fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int32_t offset); 466fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 467fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 468fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Decrements a 2-byte primary by one range step (1..0x7f). 469fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 470fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static uint32_t decTwoBytePrimaryByOneStep(uint32_t basePrimary, UBool isCompressible, int32_t step); 471fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 472fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 473fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Decrements a 3-byte primary by one range step (1..0x7f). 474fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 475fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static uint32_t decThreeBytePrimaryByOneStep(uint32_t basePrimary, UBool isCompressible, int32_t step); 476fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 477fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 478fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Computes a 3-byte primary for c's OFFSET_TAG data "CE". 479fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 480fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static uint32_t getThreeBytePrimaryForOffsetData(UChar32 c, int64_t dataCE); 481fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 482fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 483fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Returns the unassigned-character implicit primary weight for any valid code point c. 
484fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 485fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static uint32_t unassignedPrimaryFromCodePoint(UChar32 c); 486fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 487fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius static inline int64_t unassignedCEFromCodePoint(UChar32 c) { 488fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius return makeCE(unassignedPrimaryFromCodePoint(c)); 489fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 490fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 491fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusprivate: 492fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius Collation(); // No instantiation. 493fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 494fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 495fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_END 496fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 497fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif // !UCONFIG_NO_COLLATION 498fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif // __COLLATION_H__ 499