1fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/* 2fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius******************************************************************************* 3fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* Copyright (C) 2013-2014, International Business Machines 4fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* Corporation and others. All Rights Reserved. 5fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius******************************************************************************* 6fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* collationsets.h 7fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* 8fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* created on: 2013feb09 9fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius* created by: Markus W. Scherer 10fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius*/ 11fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 12fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifndef __COLLATIONSETS_H__ 13fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#define __COLLATIONSETS_H__ 14fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 15fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/utypes.h" 16fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 17fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#if !UCONFIG_NO_COLLATION 18fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 19fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/uniset.h" 20fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collation.h" 21fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 22fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_BEGIN 23fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 24fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstruct CollationData; 25fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 26fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/** 27fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Finds the set of characters and strings that sort differently in the tailoring 28fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * from the base data. 29fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 30fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Every mapping in the tailoring needs to be compared to the base, 31fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * because some mappings are copied for optimization, and 32fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * all contractions for a character are copied if any contractions for that character 33fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * are added, modified or removed. 34fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * 35fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * It might be simpler to re-parse the rule string, but: 36fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * - That would require duplicating some of the from-rules builder code. 37fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * - That would make the runtime code depend on the builder. 38fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * - That would only work if we have the rule string, and we allow users to 39fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * omit the rule string from data files. 40fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 41fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass TailoredSet : public UMemory { 42fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic: 43fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius TailoredSet(UnicodeSet *t) 44fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), baseData(NULL), 45fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius tailored(t), 46fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius suffix(NULL), 47fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode(U_ZERO_ERROR) {} 48fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 49fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void forData(const CollationData *d, UErrorCode &errorCode); 50fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 51fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** 52fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * @return U_SUCCESS(errorCode) in C++, void in Java 53fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * @internal only public for access by callback 54fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */ 55fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool handleCE32(UChar32 start, UChar32 end, uint32_t ce32); 56fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 57fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusprivate: 58fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void compare(UChar32 c, uint32_t ce32, uint32_t baseCE32); 59fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void comparePrefixes(UChar32 c, const UChar *p, const UChar *q); 60fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void compareContractions(UChar32 c, const UChar *p, const UChar *q); 61fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 62fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void addPrefixes(const CollationData *d, UChar32 c, const UChar *p); 63fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32); 64fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void addContractions(UChar32 c, const UChar *p); 65fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void addSuffix(UChar32 c, const UnicodeString &sfx); 66fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void add(UChar32 c); 67fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 68fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Prefixes are reversed in the data structure. */ 69fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void setPrefix(const UnicodeString &pfx) { 70fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius unreversedPrefix = pfx; 71fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius unreversedPrefix.reverse(); 72fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 73fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void resetPrefix() { 74fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius unreversedPrefix.remove(); 75fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 76fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 77fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const CollationData *data; 78fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const CollationData *baseData; 79fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSet *tailored; 80fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString unreversedPrefix; 81fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UnicodeString *suffix; 82fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode errorCode; 83fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 84fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 85fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass ContractionsAndExpansions : public UMemory { 86fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic: 87fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius class CESink : public UMemory { 88fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius public: 89fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual ~CESink(); 90fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual void handleCE(int64_t ce) = 0; 91fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius virtual void handleExpansion(const int64_t ces[], int32_t length) = 0; 92fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius }; 93fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 94fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius ContractionsAndExpansions(UnicodeSet *con, UnicodeSet *exp, CESink *s, UBool prefixes) 95fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius : data(NULL), 96fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius contractions(con), expansions(exp), 97fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius sink(s), 98fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius addPrefixes(prefixes), 99fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius checkTailored(0), 100fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius suffix(NULL), 101fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius errorCode(U_ZERO_ERROR) {} 102fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 103fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void forData(const CollationData *d, UErrorCode &errorCode); 104fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec); 105fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 106fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius // all following: @internal, only public for access by callback 107fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 108fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void handleCE32(UChar32 start, UChar32 end, uint32_t ce32); 109fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 110fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void handlePrefixes(UChar32 start, UChar32 end, uint32_t ce32); 111fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void handleContractions(UChar32 start, UChar32 end, uint32_t ce32); 112fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 113fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void addExpansions(UChar32 start, UChar32 end); 114fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void addStrings(UChar32 start, UChar32 end, UnicodeSet *set); 115fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 116fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius /** Prefixes are reversed in the data structure. */ 117fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void setPrefix(const UnicodeString &pfx) { 118fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius unreversedPrefix = pfx; 119fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius unreversedPrefix.reverse(); 120fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 121fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius void resetPrefix() { 122fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius unreversedPrefix.remove(); 123fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius } 124fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 125fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const CollationData *data; 126fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSet *contractions; 127fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSet *expansions; 128fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius CESink *sink; 129fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UBool addPrefixes; 130fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int8_t checkTailored; // -1: collected tailored +1: exclude tailored 131fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSet tailored; 132fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeSet ranges; 133fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UnicodeString unreversedPrefix; 134fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius const UnicodeString *suffix; 135fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius int64_t ces[Collation::MAX_EXPANSION_LENGTH]; 136fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius UErrorCode errorCode; 137fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius}; 138fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 139fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_END 140fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius 141fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif // !UCONFIG_NO_COLLATION 142fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif // __COLLATIONSETS_H__ 143