1/*
2******************************************************************************
3*
4*   Copyright (C) 2007, International Business Machines
5*   Corporation and others.  All Rights Reserved.
6*
7******************************************************************************
8*   file name:  unisetspan.h
9*   encoding:   US-ASCII
10*   tab size:   8 (not used)
11*   indentation:4
12*
13*   created on: 2007mar01
14*   created by: Markus W. Scherer
15*/
16
17#ifndef __UNISETSPAN_H__
18#define __UNISETSPAN_H__
19
20#include "unicode/utypes.h"
21#include "unicode/uniset.h"
22
23U_NAMESPACE_BEGIN
24
25/*
26 * Implement span() etc. for a set with strings.
27 * Avoid recursion because of its exponential complexity.
28 * Instead, try multiple paths at once and track them with an IndexList.
29 */
30class UnicodeSetStringSpan : public UMemory {
31public:
32    /*
33     * Which span() variant will be used?
34     * The object is either built for one variant and used once,
35     * or built for all and may be used many times.
36     */
37    enum {
38        FWD             = 0x20,
39        BACK            = 0x10,
40        UTF16           = 8,
41        UTF8            = 4,
42        CONTAINED       = 2,
43        NOT_CONTAINED   = 1,
44
45        ALL             = 0x3f,
46
47        FWD_UTF16_CONTAINED     = FWD  | UTF16 |     CONTAINED,
48        FWD_UTF16_NOT_CONTAINED = FWD  | UTF16 | NOT_CONTAINED,
49        FWD_UTF8_CONTAINED      = FWD  | UTF8  |     CONTAINED,
50        FWD_UTF8_NOT_CONTAINED  = FWD  | UTF8  | NOT_CONTAINED,
51        BACK_UTF16_CONTAINED    = BACK | UTF16 |     CONTAINED,
52        BACK_UTF16_NOT_CONTAINED= BACK | UTF16 | NOT_CONTAINED,
53        BACK_UTF8_CONTAINED     = BACK | UTF8  |     CONTAINED,
54        BACK_UTF8_NOT_CONTAINED = BACK | UTF8  | NOT_CONTAINED
55    };
56
57    UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings, uint32_t which);
58
59    // Copy constructor. Assumes which==ALL for a frozen set.
60    UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, const UVector &newParentSetStrings);
61
62    ~UnicodeSetStringSpan();
63
64    /*
65     * Do the strings need to be checked in span() etc.?
66     * @return TRUE if strings need to be checked (call span() here),
67     *         FALSE if not (use a BMPSet for best performance).
68     */
69    inline UBool needsStringSpanUTF16();
70    inline UBool needsStringSpanUTF8();
71
72    // For fast UnicodeSet::contains(c).
73    inline UBool contains(UChar32 c) const;
74
75    int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
76
77    int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
78
79    int32_t spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
80
81    int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const;
82
83private:
84    // Special spanLength byte values.
85    enum {
86        // The spanLength is >=0xfe.
87        LONG_SPAN=0xfe,
88        // All code points in the string are contained in the parent set.
89        ALL_CP_CONTAINED=0xff
90    };
91
92    // Add a starting or ending string character to the spanNotSet
93    // so that a character span ends before any string.
94    void addToSpanNotSet(UChar32 c);
95
96    int32_t spanNot(const UChar *s, int32_t length) const;
97    int32_t spanNotBack(const UChar *s, int32_t length) const;
98    int32_t spanNotUTF8(const uint8_t *s, int32_t length) const;
99    int32_t spanNotBackUTF8(const uint8_t *s, int32_t length) const;
100
101    // Set for span(). Same as parent but without strings.
102    UnicodeSet spanSet;
103
104    // Set for span(not contained).
105    // Same as spanSet, plus characters that start or end strings.
106    UnicodeSet *pSpanNotSet;
107
108    // The strings of the parent set.
109    const UVector &strings;
110
111    // Pointer to the UTF-8 string lengths.
112    // Also pointer to further allocated storage for meta data and
113    // UTF-8 string contents as necessary.
114    int32_t *utf8Lengths;
115
116    // Pointer to the part of the (utf8Lengths) memory block that stores
117    // the lengths of span(), spanBack() etc. for each string.
118    uint8_t *spanLengths;
119
120    // Pointer to the part of the (utf8Lengths) memory block that stores
121    // the UTF-8 versions of the parent set's strings.
122    uint8_t *utf8;
123
124    // Number of bytes for all UTF-8 versions of strings together.
125    int32_t utf8Length;
126
127    // Maximum lengths of relevant strings.
128    int32_t maxLength16;
129    int32_t maxLength8;
130
131    // Set up for all variants of span()?
132    UBool all;
133
134    // Memory for small numbers and lengths of strings.
135    // For example, for 8 strings:
136    // 8 UTF-8 lengths, 8*4 bytes span lengths, 8*2 3-byte UTF-8 characters
137    // = 112 bytes = int32_t[28].
138    int32_t staticLengths[32];
139};
140
141UBool UnicodeSetStringSpan::needsStringSpanUTF16() {
142    return (UBool)(maxLength16!=0);
143}
144
145UBool UnicodeSetStringSpan::needsStringSpanUTF8() {
146    return (UBool)(maxLength8!=0);
147}
148
149UBool UnicodeSetStringSpan::contains(UChar32 c) const {
150    return spanSet.contains(c);
151}
152
153U_NAMESPACE_END
154
155#endif
156