1/*
2******************************************************************************
3*
4*   Copyright (C) 2007, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  bmpset.h
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2007jan29
14*   created by: Markus W. Scherer
15*/
16
17#ifndef __BMPSET_H__
18#define __BMPSET_H__
19
20#include "unicode/utypes.h"
21#include "unicode/uniset.h"
22
23U_NAMESPACE_BEGIN
24
25/*
26 * Helper class for frozen UnicodeSets, implements contains() and span()
27 * optimized for BMP code points. Structured to be UTF-8-friendly.
28 *
29 * ASCII: Look up bytes.
30 * 2-byte characters: Bits organized vertically.
31 * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
32 *                    with mixed for illegal ranges.
33 * Supplementary characters: Call contains() on the parent set.
34 */
35class BMPSet : public UMemory {
36public:
37    BMPSet(const int32_t *parentList, int32_t parentListLength);
38    BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength);
39    virtual ~BMPSet();
40
41    virtual UBool contains(UChar32 c) const;
42
43    /*
44     * Span the initial substring for which each character c has spanCondition==contains(c).
45     * It must be s<limit and spanCondition==0 or 1.
46     * @return The string pointer which limits the span.
47     */
48    const UChar *span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;
49
50    /*
51     * Span the trailing substring for which each character c has spanCondition==contains(c).
52     * It must be s<limit and spanCondition==0 or 1.
53     * @return The string pointer which starts the span.
54     */
55    const UChar *spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const;
56
57    /*
58     * Span the initial substring for which each character c has spanCondition==contains(c).
59     * It must be length>0 and spanCondition==0 or 1.
60     * @return The string pointer which limits the span.
61     */
62    const uint8_t *spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
63
64    /*
65     * Span the trailing substring for which each character c has spanCondition==contains(c).
66     * It must be length>0 and spanCondition==0 or 1.
67     * @return The start of the span.
68     */
69    int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
70
71private:
72    void initBits();
73    void overrideIllegal();
74
75    /**
76     * Same as UnicodeSet::findCodePoint(UChar32 c) const except that the
77     * binary search is restricted for finding code points in a certain range.
78     *
79     * For restricting the search for finding in the range start..end,
80     * pass in
81     *   lo=findCodePoint(start) and
82     *   hi=findCodePoint(end)
83     * with 0<=lo<=hi<len.
84     * findCodePoint(c) defaults to lo=0 and hi=len-1.
85     *
86     * @param c a character in a subrange of MIN_VALUE..MAX_VALUE
87     * @param lo The lowest index to be returned.
88     * @param hi The highest index to be returned.
89     * @return the smallest integer i in the range lo..hi,
90     *         inclusive, such that c < list[i]
91     */
92    int32_t findCodePoint(UChar32 c, int32_t lo, int32_t hi) const;
93
94    inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
95
96    /*
97     * One byte per ASCII character, or trail byte in lead position.
98     * 0 or 1 for ASCII characters.
99     * The value for trail bytes is the result of contains(FFFD)
100     * for faster validity checking at runtime.
101     */
102    UBool asciiBytes[0xc0];
103
104    /*
105     * One bit per code point from U+0000..U+07FF.
106     * The bits are organized vertically; consecutive code points
107     * correspond to the same bit positions in consecutive table words.
108     * With code point parts
109     *   lead=c{10..6}
110     *   trail=c{5..0}
111     * it is set.contains(c)==(table7FF[trail] bit lead)
112     *
113     * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD)
114     * for faster validity checking at runtime.
115     */
116    uint32_t table7FF[64];
117
118    /*
119     * One bit per 64 BMP code points.
120     * The bits are organized vertically; consecutive 64-code point blocks
121     * correspond to the same bit position in consecutive table words.
122     * With code point parts
123     *   lead=c{15..12}
124     *   t1=c{11..6}
125     * test bits (lead+16) and lead in bmpBlockBits[t1].
126     * If the upper bit is 0, then the lower bit indicates if contains(c)
127     * for all code points in the 64-block.
128     * If the upper bit is 1, then the block is mixed and set.contains(c)
129     * must be called.
130     *
131     * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to
132     * the result of contains(FFFD) for faster validity checking at runtime.
133     */
134    uint32_t bmpBlockBits[64];
135
136    /*
137     * Inversion list indexes for restricted binary searches in
138     * findCodePoint(), from
139     * findCodePoint(U+0800, U+1000, U+2000, .., U+F000, U+10000).
140     * U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
141     * always looked up in the bit tables.
142     * The last pair of indexes is for finding supplementary code points.
143     */
144    int32_t list4kStarts[18];
145
146    /*
147     * The inversion list of the parent set, for the slower contains() implementation
148     * for mixed BMP blocks and for supplementary code points.
149     * The list is terminated with list[listLength-1]=0x110000.
150     */
151    const int32_t *list;
152    int32_t listLength;
153};
154
155inline UBool BMPSet::containsSlow(UChar32 c, int32_t lo, int32_t hi) const {
156    return (UBool)(findCodePoint(c, lo, hi) & 1);
157}
158
159U_NAMESPACE_END
160
161#endif
162