/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationsets.h
*
* created on: 2013feb09
* created by: Markus W. Scherer
*/

12fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#ifndef __COLLATIONSETS_H__
13fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#define __COLLATIONSETS_H__
14fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
15fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/utypes.h"
16fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
17fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#if !UCONFIG_NO_COLLATION
18fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
19fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "unicode/uniset.h"
20fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#include "collation.h"
21fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
22fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_BEGIN
23fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
24fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusstruct CollationData;
25fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
26fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius/**
27fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Finds the set of characters and strings that sort differently in the tailoring
28fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * from the base data.
29fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *
30fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * Every mapping in the tailoring needs to be compared to the base,
31fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * because some mappings are copied for optimization, and
32fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * all contractions for a character are copied if any contractions for that character
33fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * are added, modified or removed.
34fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *
35fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * It might be simpler to re-parse the rule string, but:
36fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * - That would require duplicating some of the from-rules builder code.
37fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * - That would make the runtime code depend on the builder.
38fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius * - That would only work if we have the rule string, and we allow users to
39fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius *   omit the rule string from data files.
40fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius */
41fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass TailoredSet : public UMemory {
42fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic:
43fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    TailoredSet(UnicodeSet *t)
44fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            : data(NULL), baseData(NULL),
45fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius              tailored(t),
46fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius              suffix(NULL),
47fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius              errorCode(U_ZERO_ERROR) {}
48fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
49fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void forData(const CollationData *d, UErrorCode &errorCode);
50fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
51fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    /**
52fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius     * @return U_SUCCESS(errorCode) in C++, void in Java
53fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius     * @internal only public for access by callback
54fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius     */
55fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UBool handleCE32(UChar32 start, UChar32 end, uint32_t ce32);
56fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
57fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusprivate:
58fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void compare(UChar32 c, uint32_t ce32, uint32_t baseCE32);
59fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void comparePrefixes(UChar32 c, const UChar *p, const UChar *q);
60fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void compareContractions(UChar32 c, const UChar *p, const UChar *q);
61fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
62fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void addPrefixes(const CollationData *d, UChar32 c, const UChar *p);
63fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32);
64fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void addContractions(UChar32 c, const UChar *p);
65fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void addSuffix(UChar32 c, const UnicodeString &sfx);
66fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void add(UChar32 c);
67fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
68fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    /** Prefixes are reversed in the data structure. */
69fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void setPrefix(const UnicodeString &pfx) {
70fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        unreversedPrefix = pfx;
71fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        unreversedPrefix.reverse();
72fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
73fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void resetPrefix() {
74fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        unreversedPrefix.remove();
75fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
76fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
77fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    const CollationData *data;
78fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    const CollationData *baseData;
79fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeSet *tailored;
80fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeString unreversedPrefix;
81fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    const UnicodeString *suffix;
82fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UErrorCode errorCode;
83fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius};
84fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
85fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusclass ContractionsAndExpansions : public UMemory {
86fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliuspublic:
87fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    class CESink : public UMemory {
88fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    public:
89fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        virtual ~CESink();
90fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        virtual void handleCE(int64_t ce) = 0;
91fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        virtual void handleExpansion(const int64_t ces[], int32_t length) = 0;
92fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    };
93fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
94fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    ContractionsAndExpansions(UnicodeSet *con, UnicodeSet *exp, CESink *s, UBool prefixes)
95fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius            : data(NULL),
96fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius              contractions(con), expansions(exp),
97fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius              sink(s),
98fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius              addPrefixes(prefixes),
99fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius              checkTailored(0),
100fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius              suffix(NULL),
101fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius              errorCode(U_ZERO_ERROR) {}
102fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
103fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void forData(const CollationData *d, UErrorCode &errorCode);
104fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec);
105fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
106fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    // all following: @internal, only public for access by callback
107fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
108fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void handleCE32(UChar32 start, UChar32 end, uint32_t ce32);
109fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
110fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void handlePrefixes(UChar32 start, UChar32 end, uint32_t ce32);
111fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void handleContractions(UChar32 start, UChar32 end, uint32_t ce32);
112fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
113fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void addExpansions(UChar32 start, UChar32 end);
114fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void addStrings(UChar32 start, UChar32 end, UnicodeSet *set);
115fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
116fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    /** Prefixes are reversed in the data structure. */
117fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void setPrefix(const UnicodeString &pfx) {
118fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        unreversedPrefix = pfx;
119fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        unreversedPrefix.reverse();
120fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
121fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    void resetPrefix() {
122fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius        unreversedPrefix.remove();
123fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    }
124fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
125fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    const CollationData *data;
126fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeSet *contractions;
127fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeSet *expansions;
128fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    CESink *sink;
129fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UBool addPrefixes;
130fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int8_t checkTailored;  // -1: collected tailored  +1: exclude tailored
131fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeSet tailored;
132fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeSet ranges;
133fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UnicodeString unreversedPrefix;
134fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    const UnicodeString *suffix;
135fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    int64_t ces[Collation::MAX_EXPANSION_LENGTH];
136fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius    UErrorCode errorCode;
137fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius};
138fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
139fceb39872958b9fa2505e63f8b8699a9e0f882f4ccorneliusU_NAMESPACE_END
140fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius
141fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif  // !UCONFIG_NO_COLLATION
142fceb39872958b9fa2505e63f8b8699a9e0f882f4ccornelius#endif  // __COLLATIONSETS_H__
143