1/*
2***************************************************************************
3* Copyright (C) 1999-2013, International Business Machines Corporation
4* and others. All Rights Reserved.
5***************************************************************************
6*   Date        Name        Description
7*   10/20/99    alan        Creation.
8***************************************************************************
9*/
10
11#ifndef UNICODESET_H
12#define UNICODESET_H
13
14#include "unicode/unifilt.h"
15#include "unicode/unistr.h"
16#include "unicode/uset.h"
17
18/**
19 * \file
20 * \brief C++ API: Unicode Set
21 */
22
23U_NAMESPACE_BEGIN
24
25// Forward Declarations.
26void UnicodeSet_initInclusion(int32_t src, UErrorCode &status); /**< @internal */
27
28class BMPSet;
29class ParsePosition;
30class RBBIRuleScanner;
31class SymbolTable;
32class UnicodeSetStringSpan;
33class UVector;
34class RuleCharacterIterator;
35
36/**
37 * A mutable set of Unicode characters and multicharacter strings.  Objects of this class
38 * represent <em>character classes</em> used in regular expressions.
39 * A character class specifies a subset of Unicode code points.  Legal
40 * code points are U+0000 to U+10FFFF, inclusive.
41 *
42 * <p>The UnicodeSet class is not designed to be subclassed.
43 *
44 * <p><code>UnicodeSet</code> supports two APIs. The first is the
45 * <em>operand</em> API that allows the caller to modify the value of
46 * a <code>UnicodeSet</code> object. It conforms to Java 2's
47 * <code>java.util.Set</code> interface, although
48 * <code>UnicodeSet</code> does not actually implement that
49 * interface. All methods of <code>Set</code> are supported, with the
50 * modification that they take a character range or single character
51 * instead of an <code>Object</code>, and they take a
52 * <code>UnicodeSet</code> instead of a <code>Collection</code>.  The
53 * operand API may be thought of in terms of boolean logic: a boolean
54 * OR is implemented by <code>add</code>, a boolean AND is implemented
55 * by <code>retain</code>, a boolean XOR is implemented by
56 * <code>complement</code> taking an argument, and a boolean NOT is
57 * implemented by <code>complement</code> with no argument.  In terms
58 * of traditional set theory function names, <code>add</code> is a
59 * union, <code>retain</code> is an intersection, <code>remove</code>
60 * is an asymmetric difference, and <code>complement</code> with no
61 * argument is a set complement with respect to the superset range
62 * <code>MIN_VALUE-MAX_VALUE</code>.
63 *
64 * <p>The second API is the
65 * <code>applyPattern()</code>/<code>toPattern()</code> API from the
66 * <code>java.text.Format</code>-derived classes.  Unlike the
67 * methods that add characters, add categories, and control the logic
68 * of the set, the method <code>applyPattern()</code> sets all
69 * attributes of a <code>UnicodeSet</code> at once, based on a
70 * string pattern.
71 *
72 * <p><b>Pattern syntax</b></p>
73 *
74 * Patterns are accepted by the constructors and the
75 * <code>applyPattern()</code> methods and returned by the
76 * <code>toPattern()</code> method.  These patterns follow a syntax
77 * similar to that employed by version 8 regular expression character
78 * classes.  Here are some simple examples:
79 *
80 * \htmlonly<blockquote>\endhtmlonly
81 *   <table>
82 *     <tr align="top">
83 *       <td nowrap valign="top" align="left"><code>[]</code></td>
84 *       <td valign="top">No characters</td>
85 *     </tr><tr align="top">
86 *       <td nowrap valign="top" align="left"><code>[a]</code></td>
87 *       <td valign="top">The character 'a'</td>
88 *     </tr><tr align="top">
89 *       <td nowrap valign="top" align="left"><code>[ae]</code></td>
90 *       <td valign="top">The characters 'a' and 'e'</td>
91 *     </tr>
92 *     <tr>
93 *       <td nowrap valign="top" align="left"><code>[a-e]</code></td>
94 *       <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
95 *       point order</td>
96 *     </tr>
97 *     <tr>
98 *       <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
99 *       <td valign="top">The character U+4E01</td>
100 *     </tr>
101 *     <tr>
102 *       <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
103 *       <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and
104 *       &quot;ac&quot;</td>
105 *     </tr>
106 *     <tr>
107 *       <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
108 *       <td valign="top">All characters in the general category Uppercase Letter</td>
109 *     </tr>
110 *   </table>
111 * \htmlonly</blockquote>\endhtmlonly
112 *
113 * Any character may be preceded by a backslash in order to remove any special
114 * meaning.  White space characters, as defined by UCharacter.isWhitespace(), are
115 * ignored, unless they are escaped.
116 *
117 * <p>Property patterns specify a set of characters having a certain
118 * property as defined by the Unicode standard.  Both the POSIX-like
119 * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized.  For a
120 * complete list of supported property patterns, see the User's Guide
121 * for UnicodeSet at
122 * <a href="http://icu-project.org/userguide/unicodeSet.html">
123 * http://icu-project.org/userguide/unicodeSet.html</a>.
124 * Actual determination of property data is defined by the underlying
125 * Unicode database as implemented by UCharacter.
126 *
127 * <p>Patterns specify individual characters, ranges of characters, and
128 * Unicode property sets.  When elements are concatenated, they
129 * specify their union.  To complement a set, place a '^' immediately
130 * after the opening '['.  Property patterns are inverted by modifying
131 * their delimiters; "[:^foo:]" and "\\P{foo}".  In any other location,
132 * '^' has no special meaning.
133 *
134 * <p>Ranges are indicated by placing a '-' between two
135 * characters, as in "a-z".  This specifies the range of all
136 * characters from the left to the right, in Unicode order.  If the
137 * left character is greater than or equal to the
138 * right character it is a syntax error.  If a '-' occurs as the first
139 * character after the opening '[' or '[^', or if it occurs as the
140 * last character before the closing ']', then it is taken as a
141 * literal.  Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
142 * set of three characters, 'a', 'b', and '-'.
143 *
144 * <p>Sets may be intersected using the '&' operator or the asymmetric
145 * set difference may be taken using the '-' operator, for example,
146 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
147 * with values less than 4096.  Operators ('&' and '-') have equal
148 * precedence and bind left-to-right.  Thus
149 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
150 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
151 * difference; intersection is commutative.
152 *
153 * <table>
154 * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
155 * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
156 * through 'z' and all letters in between, in Unicode order
157 * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
158 * all characters but 'a' through 'z',
159 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
160 * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
161 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
162 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
163 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
164 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
165 * <td>The asymmetric difference of sets specified by <em>pat1</em> and
166 * <em>pat2</em>
167 * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
168 * <td>The set of characters having the specified
169 * Unicode property; in
170 * this case, Unicode uppercase letters
171 * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
172 * <td>The set of characters <em>not</em> having the given
173 * Unicode property
174 * </table>
175 *
176 * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
177 *
178 * <p><b>Formal syntax</b></p>
179 *
180 * \htmlonly<blockquote>\endhtmlonly
181 *   <table>
182 *     <tr align="top">
183 *       <td nowrap valign="top" align="right"><code>pattern :=&nbsp; </code></td>
184 *       <td valign="top"><code>('[' '^'? item* ']') |
185 *       property</code></td>
186 *     </tr>
187 *     <tr align="top">
188 *       <td nowrap valign="top" align="right"><code>item :=&nbsp; </code></td>
189 *       <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
190 *       </code></td>
191 *     </tr>
192 *     <tr align="top">
193 *       <td nowrap valign="top" align="right"><code>pattern-expr :=&nbsp; </code></td>
194 *       <td valign="top"><code>pattern | pattern-expr pattern |
195 *       pattern-expr op pattern<br>
196 *       </code></td>
197 *     </tr>
198 *     <tr align="top">
199 *       <td nowrap valign="top" align="right"><code>op :=&nbsp; </code></td>
200 *       <td valign="top"><code>'&amp;' | '-'<br>
201 *       </code></td>
202 *     </tr>
203 *     <tr align="top">
204 *       <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>
205 *       <td valign="top"><code>'[' | ']' | '-'<br>
206 *       </code></td>
207 *     </tr>
208 *     <tr align="top">
209 *       <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
210 *       <td valign="top"><em>any character that is not</em><code> special<br>
211 *       | ('\' </code><em>any character</em><code>)<br>
212 *       | ('\\u' hex hex hex hex)<br>
213 *       </code></td>
214 *     </tr>
215 *     <tr align="top">
216 *       <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
217 *       <td valign="top"><em>any character for which
218 *       </em><code>Character.digit(c, 16)</code><em>
219 *       returns a non-negative result</em></td>
220 *     </tr>
221 *     <tr>
222 *       <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
223 *       <td valign="top"><em>a Unicode property set pattern</em></td>
224 *     </tr>
225 *   </table>
226 *   <br>
227 *   <table border="1">
228 *     <tr>
229 *       <td>Legend: <table>
230 *         <tr>
231 *           <td nowrap valign="top"><code>a := b</code></td>
232 *           <td width="20" valign="top">&nbsp; </td>
233 *           <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
234 *         </tr>
235 *         <tr>
236 *           <td nowrap valign="top"><code>a?</code></td>
237 *           <td valign="top"></td>
238 *           <td valign="top">zero or one instance of <code>a</code><br>
239 *           </td>
240 *         </tr>
241 *         <tr>
242 *           <td nowrap valign="top"><code>a*</code></td>
243 *           <td valign="top"></td>
244 *           <td valign="top">zero or more instances of <code>a</code><br>
245 *           </td>
246 *         </tr>
247 *         <tr>
248 *           <td nowrap valign="top"><code>a | b</code></td>
249 *           <td valign="top"></td>
250 *           <td valign="top">either <code>a</code> or <code>b</code><br>
251 *           </td>
252 *         </tr>
253 *         <tr>
254 *           <td nowrap valign="top"><code>'a'</code></td>
255 *           <td valign="top"></td>
256 *           <td valign="top">the literal string between the quotes </td>
257 *         </tr>
258 *       </table>
259 *       </td>
260 *     </tr>
261 *   </table>
262 * \htmlonly</blockquote>\endhtmlonly
263 *
264 * <p>Note:
265 *  - Most UnicodeSet methods do not take a UErrorCode parameter because
266 *   there are usually very few opportunities for failure other than a shortage
267 *   of memory, error codes in low-level C++ string methods would be inconvenient,
268 *   and the error code as the last parameter (ICU convention) would prevent
269 *   the use of default parameter values.
270 *   Instead, such methods set the UnicodeSet into a "bogus" state
271 *   (see isBogus()) if an error occurs.
272 *
273 * @author Alan Liu
274 * @stable ICU 2.0
275 */
276class U_COMMON_API UnicodeSet : public UnicodeFilter {
277
278    int32_t len; // length of list used; 0 <= len <= capacity
279    int32_t capacity; // capacity of list
280    UChar32* list; // MUST be terminated with HIGH
281    BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
282    UChar32* buffer; // internal buffer, may be NULL
283    int32_t bufferCapacity; // capacity of buffer
284    int32_t patLen;
285
286    /**
287     * The pattern representation of this set.  This may not be the
288     * most economical pattern.  It is the pattern supplied to
289     * applyPattern(), with variables substituted and whitespace
290     * removed.  For sets constructed without applyPattern(), or
291     * modified using the non-pattern API, this string will be empty,
292     * indicating that toPattern() must generate a pattern
293     * representation from the inversion list.
294     */
295    UChar *pat;
296    UVector* strings; // maintained in sorted order
297    UnicodeSetStringSpan *stringSpan;
298
299private:
300    enum { // constants
301        kIsBogus = 1       // This set is bogus (i.e. not valid)
302    };
303    uint8_t fFlags;         // Bit flag (see constants above)
304public:
305    /**
306     * Determine whether this object is bogus, that is, does not contain a valid set.
307     * A bogus set has no value. It is different from an empty set.
308     * It can be used to indicate that no set value is available.
309     *
310     * @return TRUE if the set is bogus (invalid), FALSE otherwise
311     * @see setToBogus()
312     * @stable ICU 4.0
313     */
314    inline UBool isBogus(void) const;
315
316    /**
317     * Make this UnicodeSet object invalid.
318     * The set will test TRUE with isBogus().
319     *
320     * A bogus set has no value. It is different from an empty set.
321     * It can be used to indicate that no set value is available.
322     *
323     * This utility function is used throughout the UnicodeSet
324     * implementation to indicate that a UnicodeSet operation failed,
325     * and may be used in other functions,
326     * especially but not exclusively when such functions do not
327     * take a UErrorCode for simplicity.
328     *
329     * @see isBogus()
330     * @stable ICU 4.0
331     */
332    void setToBogus();
333
334public:
335
336    enum {
337        /**
338         * Minimum value that can be stored in a UnicodeSet.
339         * @stable ICU 2.4
340         */
341        MIN_VALUE = 0,
342
343        /**
344         * Maximum value that can be stored in a UnicodeSet.
345         * @stable ICU 2.4
346         */
347        MAX_VALUE = 0x10ffff
348    };
349
350    //----------------------------------------------------------------
351    // Constructors &c
352    //----------------------------------------------------------------
353
354public:
355
356    /**
357     * Constructs an empty set.
358     * @stable ICU 2.0
359     */
360    UnicodeSet();
361
362    /**
363     * Constructs a set containing the given range. If <code>start >
364     * end</code> then an empty set is created.
365     *
366     * @param start first character, inclusive, of range
367     * @param end last character, inclusive, of range
368     * @stable ICU 2.4
369     */
370    UnicodeSet(UChar32 start, UChar32 end);
371
372    /**
373     * Constructs a set from the given pattern.  See the class
374     * description for the syntax of the pattern language.
375     * @param pattern a string specifying what characters are in the set
376     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
377     * contains a syntax error.
378     * @stable ICU 2.0
379     */
380    UnicodeSet(const UnicodeString& pattern,
381               UErrorCode& status);
382
383#ifndef U_HIDE_INTERNAL_API
384    /**
385     * Constructs a set from the given pattern.  See the class
386     * description for the syntax of the pattern language.
387     * @param pattern a string specifying what characters are in the set
388     * @param options bitmask for options to apply to the pattern.
389     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
390     * @param symbols a symbol table mapping variable names to values
391     * and stand-in characters to UnicodeSets; may be NULL
392     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
393     * contains a syntax error.
394     * @internal
395     */
396    UnicodeSet(const UnicodeString& pattern,
397               uint32_t options,
398               const SymbolTable* symbols,
399               UErrorCode& status);
400#endif  /* U_HIDE_INTERNAL_API */
401
402    /**
403     * Constructs a set from the given pattern.  See the class description
404     * for the syntax of the pattern language.
405     * @param pattern a string specifying what characters are in the set
406     * @param pos on input, the position in pattern at which to start parsing.
407     * On output, the position after the last character parsed.
408     * @param options bitmask for options to apply to the pattern.
409     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
410     * @param symbols a symbol table mapping variable names to values
411     * and stand-in characters to UnicodeSets; may be NULL
412     * @param status input-output error code
413     * @stable ICU 2.8
414     */
415    UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
416               uint32_t options,
417               const SymbolTable* symbols,
418               UErrorCode& status);
419
420    /**
421     * Constructs a set that is identical to the given UnicodeSet.
422     * @stable ICU 2.0
423     */
424    UnicodeSet(const UnicodeSet& o);
425
426    /**
427     * Destructs the set.
428     * @stable ICU 2.0
429     */
430    virtual ~UnicodeSet();
431
432    /**
433     * Assigns this object to be a copy of another.
434     * A frozen set will not be modified.
435     * @stable ICU 2.0
436     */
437    UnicodeSet& operator=(const UnicodeSet& o);
438
439    /**
440     * Compares the specified object with this set for equality.  Returns
441     * <tt>true</tt> if the two sets
442     * have the same size, and every member of the specified set is
443     * contained in this set (or equivalently, every member of this set is
444     * contained in the specified set).
445     *
446     * @param o set to be compared for equality with this set.
447     * @return <tt>true</tt> if the specified set is equal to this set.
448     * @stable ICU 2.0
449     */
450    virtual UBool operator==(const UnicodeSet& o) const;
451
452    /**
453     * Compares the specified object with this set for inequality.  Returns
454     * <tt>true</tt> if the specified set is not equal to this set.
455     * @stable ICU 2.0
456     */
457    UBool operator!=(const UnicodeSet& o) const;
458
459    /**
460     * Returns a copy of this object.  All UnicodeFunctor objects have
461     * to support cloning in order to allow classes using
462     * UnicodeFunctors, such as Transliterator, to implement cloning.
463     * If this set is frozen, then the clone will be frozen as well.
464     * Use cloneAsThawed() for a mutable clone of a frozen set.
465     * @see cloneAsThawed
466     * @stable ICU 2.0
467     */
468    virtual UnicodeFunctor* clone() const;
469
470    /**
471     * Returns the hash code value for this set.
472     *
473     * @return the hash code value for this set.
474     * @see Object#hashCode()
475     * @stable ICU 2.0
476     */
477    virtual int32_t hashCode(void) const;
478
479    /**
480     * Get a UnicodeSet pointer from a USet
481     *
482     * @param uset a USet (the ICU plain C type for UnicodeSet)
483     * @return the corresponding UnicodeSet pointer.
484     *
485     * @stable ICU 4.2
486     */
487    inline static UnicodeSet *fromUSet(USet *uset);
488
489    /**
490     * Get a UnicodeSet pointer from a const USet
491     *
492     * @param uset a const USet (the ICU plain C type for UnicodeSet)
493     * @return the corresponding UnicodeSet pointer.
494     *
495     * @stable ICU 4.2
496     */
497    inline static const UnicodeSet *fromUSet(const USet *uset);
498
499    /**
500     * Produce a USet * pointer for this UnicodeSet.
501     * USet is the plain C type for UnicodeSet
502     *
503     * @return a USet pointer for this UnicodeSet
504     * @stable ICU 4.2
505     */
506    inline USet *toUSet();
507
508
509    /**
510     * Produce a const USet * pointer for this UnicodeSet.
511     * USet is the plain C type for UnicodeSet
512     *
513     * @return a const USet pointer for this UnicodeSet
514     * @stable ICU 4.2
515     */
516    inline const USet * toUSet() const;
517
518
519    //----------------------------------------------------------------
520    // Freezable API
521    //----------------------------------------------------------------
522
523    /**
524     * Determines whether the set has been frozen (made immutable) or not.
525     * See the ICU4J Freezable interface for details.
526     * @return TRUE/FALSE for whether the set has been frozen
527     * @see freeze
528     * @see cloneAsThawed
529     * @stable ICU 3.8
530     */
531    inline UBool isFrozen() const;
532
533    /**
534     * Freeze the set (make it immutable).
535     * Once frozen, it cannot be unfrozen and is therefore thread-safe
536     * until it is deleted.
537     * See the ICU4J Freezable interface for details.
538     * Freezing the set may also make some operations faster, for example
539     * contains() and span().
540     * A frozen set will not be modified. (It remains frozen.)
541     * @return this set.
542     * @see isFrozen
543     * @see cloneAsThawed
544     * @stable ICU 3.8
545     */
546    UnicodeFunctor *freeze();
547
548    /**
549     * Clone the set and make the clone mutable.
550     * See the ICU4J Freezable interface for details.
551     * @return the mutable clone
552     * @see freeze
553     * @see isFrozen
554     * @stable ICU 3.8
555     */
556    UnicodeFunctor *cloneAsThawed() const;
557
558    //----------------------------------------------------------------
559    // Public API
560    //----------------------------------------------------------------
561
562    /**
563     * Make this object represent the range <code>start - end</code>.
564     * If <code>start > end</code> then this object is set to an
565     * empty range.
566     * A frozen set will not be modified.
567     *
568     * @param start first character in the set, inclusive
569     * @param end last character in the set, inclusive
570     * @stable ICU 2.4
571     */
572    UnicodeSet& set(UChar32 start, UChar32 end);
573
574    /**
575     * Return true if the given position, in the given pattern, appears
576     * to be the start of a UnicodeSet pattern.
577     * @stable ICU 2.4
578     */
579    static UBool resemblesPattern(const UnicodeString& pattern,
580                                  int32_t pos);
581
582    /**
583     * Modifies this set to represent the set specified by the given
584     * pattern, ignoring Unicode Pattern_White_Space characters.
585     * See the class description for the syntax of the pattern language.
586     * A frozen set will not be modified.
587     * @param pattern a string specifying what characters are in the set
588     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
589     * contains a syntax error.
590     * <em> Empties the set passed before applying the pattern.</em>
591     * @return a reference to this
592     * @stable ICU 2.0
593     */
594    UnicodeSet& applyPattern(const UnicodeString& pattern,
595                             UErrorCode& status);
596
597#ifndef U_HIDE_INTERNAL_API
598    /**
599     * Modifies this set to represent the set specified by the given
600     * pattern, optionally ignoring Unicode Pattern_White_Space characters.
601     * See the class description for the syntax of the pattern language.
602     * A frozen set will not be modified.
603     * @param pattern a string specifying what characters are in the set
604     * @param options bitmask for options to apply to the pattern.
605     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
606     * @param symbols a symbol table mapping variable names to
607     * values and stand-ins to UnicodeSets; may be NULL
608     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
609     * contains a syntax error.
610     *<em> Empties the set passed before applying the pattern.</em>
611     * @return a reference to this
612     * @internal
613     */
614    UnicodeSet& applyPattern(const UnicodeString& pattern,
615                             uint32_t options,
616                             const SymbolTable* symbols,
617                             UErrorCode& status);
618#endif  /* U_HIDE_INTERNAL_API */
619
620    /**
621     * Parses the given pattern, starting at the given position.  The
622     * character at pattern.charAt(pos.getIndex()) must be '[', or the
623     * parse fails.  Parsing continues until the corresponding closing
624     * ']'.  If a syntax error is encountered between the opening and
625     * closing brace, the parse fails.  Upon return from a successful
626     * parse, the ParsePosition is updated to point to the character
627     * following the closing ']', and this set represents the
628     * parsed pattern.  This method calls
629     * itself recursively to parse embedded subpatterns.
630     *<em> Empties the set passed before applying the pattern.</em>
631     * A frozen set will not be modified.
632     *
633     * @param pattern the string containing the pattern to be parsed.
634     * The portion of the string from pos.getIndex(), which must be a
635     * '[', to the corresponding closing ']', is parsed.
636     * @param pos upon entry, the position at which to begin parsing.
637     * The character at pattern.charAt(pos.getIndex()) must be a '['.
638     * Upon return from a successful parse, pos.getIndex() is either
639     * the character after the closing ']' of the parsed pattern, or
640     * pattern.length() if the closing ']' is the last character of
641     * the pattern string.
642     * @param options bitmask for options to apply to the pattern.
643     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
644     * @param symbols a symbol table mapping variable names to
645     * values and stand-ins to UnicodeSets; may be NULL
646     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
647     * contains a syntax error.
648     * @return a reference to this
649     * @stable ICU 2.8
650     */
651    UnicodeSet& applyPattern(const UnicodeString& pattern,
652                             ParsePosition& pos,
653                             uint32_t options,
654                             const SymbolTable* symbols,
655                             UErrorCode& status);
656
657    /**
658     * Returns a string representation of this set.  If the result of
659     * calling this function is passed to a UnicodeSet constructor, it
660     * will produce another set that is equal to this one.
661     * A frozen set will not be modified.
662     * @param result the string to receive the rules.  Previous
663     * contents will be deleted.
664     * @param escapeUnprintable if TRUE then convert unprintable
665     * character to their hex escape representations, \\uxxxx or
666     * \\Uxxxxxxxx.  Unprintable characters are those other than
667     * U+000A, U+0020..U+007E.
668     * @stable ICU 2.0
669     */
670    virtual UnicodeString& toPattern(UnicodeString& result,
671                             UBool escapeUnprintable = FALSE) const;
672
673    /**
674     * Modifies this set to contain those code points which have the given value
675     * for the given binary or enumerated property, as returned by
676     * u_getIntPropertyValue.  Prior contents of this set are lost.
677     * A frozen set will not be modified.
678     *
679     * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
680     * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
681     * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
682     *
683     * @param value a value in the range u_getIntPropertyMinValue(prop)..
684     * u_getIntPropertyMaxValue(prop), with one exception.  If prop is
685     * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
686     * rather a mask value produced by U_GET_GC_MASK().  This allows grouped
687     * categories such as [:L:] to be represented.
688     *
689     * @param ec error code input/output parameter
690     *
691     * @return a reference to this set
692     *
693     * @stable ICU 2.4
694     */
695    UnicodeSet& applyIntPropertyValue(UProperty prop,
696                                      int32_t value,
697                                      UErrorCode& ec);
698
699    /**
700     * Modifies this set to contain those code points which have the
701     * given value for the given property.  Prior contents of this
702     * set are lost.
703     * A frozen set will not be modified.
704     *
705     * @param prop a property alias, either short or long.  The name is matched
706     * loosely.  See PropertyAliases.txt for names and a description of loose
707     * matching.  If the value string is empty, then this string is interpreted
708     * as either a General_Category value alias, a Script value alias, a binary
709     * property alias, or a special ID.  Special IDs are matched loosely and
710     * correspond to the following sets:
711     *
712     * "ANY" = [\\u0000-\\U0010FFFF],
713     * "ASCII" = [\\u0000-\\u007F],
714     * "Assigned" = [:^Cn:].
715     *
716     * @param value a value alias, either short or long.  The name is matched
717     * loosely.  See PropertyValueAliases.txt for names and a description of
718     * loose matching.  In addition to aliases listed, numeric values and
719     * canonical combining classes may be expressed numerically, e.g., ("nv",
720     * "0.5") or ("ccc", "220").  The value string may also be empty.
721     *
722     * @param ec error code input/output parameter
723     *
724     * @return a reference to this set
725     *
726     * @stable ICU 2.4
727     */
728    UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
729                                   const UnicodeString& value,
730                                   UErrorCode& ec);
731
732    /**
733     * Returns the number of elements in this set (its cardinality).
734     * Note that the elements of a set may include both individual
735     * codepoints and strings.
736     *
737     * @return the number of elements in this set (its cardinality).
738     * @stable ICU 2.0
739     */
740    virtual int32_t size(void) const;
741
742    /**
743     * Returns <tt>true</tt> if this set contains no elements.
744     *
745     * @return <tt>true</tt> if this set contains no elements.
746     * @stable ICU 2.0
747     */
748    virtual UBool isEmpty(void) const;
749
750    /**
751     * Returns true if this set contains the given character.
752     * This function works faster with a frozen set.
753     * @param c character to be checked for containment
754     * @return true if the test condition is met
755     * @stable ICU 2.0
756     */
757    virtual UBool contains(UChar32 c) const;
758
759    /**
760     * Returns true if this set contains every character
761     * of the given range.
762     * @param start first character, inclusive, of the range
763     * @param end last character, inclusive, of the range
764     * @return true if the test condition is met
765     * @stable ICU 2.0
766     */
767    virtual UBool contains(UChar32 start, UChar32 end) const;
768
769    /**
770     * Returns <tt>true</tt> if this set contains the given
771     * multicharacter string.
772     * @param s string to be checked for containment
773     * @return <tt>true</tt> if this set contains the specified string
774     * @stable ICU 2.4
775     */
776    UBool contains(const UnicodeString& s) const;
777
778    /**
779     * Returns true if this set contains all the characters and strings
780     * of the given set.
781     * @param c set to be checked for containment
782     * @return true if the test condition is met
783     * @stable ICU 2.4
784     */
785    virtual UBool containsAll(const UnicodeSet& c) const;
786
787    /**
788     * Returns true if this set contains all the characters
789     * of the given string.
790     * @param s string containing characters to be checked for containment
791     * @return true if the test condition is met
792     * @stable ICU 2.4
793     */
794    UBool containsAll(const UnicodeString& s) const;
795
796    /**
797     * Returns true if this set contains none of the characters
798     * of the given range.
799     * @param start first character, inclusive, of the range
800     * @param end last character, inclusive, of the range
801     * @return true if the test condition is met
802     * @stable ICU 2.4
803     */
804    UBool containsNone(UChar32 start, UChar32 end) const;
805
806    /**
807     * Returns true if this set contains none of the characters and strings
808     * of the given set.
809     * @param c set to be checked for containment
810     * @return true if the test condition is met
811     * @stable ICU 2.4
812     */
813    UBool containsNone(const UnicodeSet& c) const;
814
815    /**
816     * Returns true if this set contains none of the characters
817     * of the given string.
818     * @param s string containing characters to be checked for containment
819     * @return true if the test condition is met
820     * @stable ICU 2.4
821     */
822    UBool containsNone(const UnicodeString& s) const;
823
824    /**
825     * Returns true if this set contains one or more of the characters
826     * in the given range.
827     * @param start first character, inclusive, of the range
828     * @param end last character, inclusive, of the range
829     * @return true if the condition is met
830     * @stable ICU 2.4
831     */
832    inline UBool containsSome(UChar32 start, UChar32 end) const;
833
834    /**
835     * Returns true if this set contains one or more of the characters
836     * and strings of the given set.
837     * @param s The set to be checked for containment
838     * @return true if the condition is met
839     * @stable ICU 2.4
840     */
841    inline UBool containsSome(const UnicodeSet& s) const;
842
843    /**
844     * Returns true if this set contains one or more of the characters
845     * of the given string.
846     * @param s string containing characters to be checked for containment
847     * @return true if the condition is met
848     * @stable ICU 2.4
849     */
850    inline UBool containsSome(const UnicodeString& s) const;
851
852    /**
853     * Returns the length of the initial substring of the input string which
854     * consists only of characters and strings that are contained in this set
855     * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
856     * or only of characters and strings that are not contained
857     * in this set (USET_SPAN_NOT_CONTAINED).
858     * See USetSpanCondition for details.
859     * Similar to the strspn() C library function.
860     * Unpaired surrogates are treated according to contains() of their surrogate code points.
861     * This function works faster with a frozen set and with a non-negative string length argument.
862     * @param s start of the string
863     * @param length of the string; can be -1 for NUL-terminated
864     * @param spanCondition specifies the containment condition
865     * @return the length of the initial substring according to the spanCondition;
866     *         0 if the start of the string does not fit the spanCondition
867     * @stable ICU 3.8
868     * @see USetSpanCondition
869     */
870    int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
871
872    /**
873     * Returns the end of the substring of the input string according to the USetSpanCondition.
874     * Same as <code>start+span(s.getBuffer()+start, s.length()-start, spanCondition)</code>
875     * after pinning start to 0<=start<=s.length().
876     * @param s the string
877     * @param start the start index in the string for the span operation
878     * @param spanCondition specifies the containment condition
879     * @return the exclusive end of the substring according to the spanCondition;
880     *         the substring s.tempSubStringBetween(start, end) fulfills the spanCondition
881     * @stable ICU 4.4
882     * @see USetSpanCondition
883     */
884    inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
885
886    /**
887     * Returns the start of the trailing substring of the input string which
888     * consists only of characters and strings that are contained in this set
889     * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
890     * or only of characters and strings that are not contained
891     * in this set (USET_SPAN_NOT_CONTAINED).
892     * See USetSpanCondition for details.
893     * Unpaired surrogates are treated according to contains() of their surrogate code points.
894     * This function works faster with a frozen set and with a non-negative string length argument.
895     * @param s start of the string
896     * @param length of the string; can be -1 for NUL-terminated
897     * @param spanCondition specifies the containment condition
898     * @return the start of the trailing substring according to the spanCondition;
899     *         the string length if the end of the string does not fit the spanCondition
900     * @stable ICU 3.8
901     * @see USetSpanCondition
902     */
903    int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
904
905    /**
906     * Returns the start of the substring of the input string according to the USetSpanCondition.
907     * Same as <code>spanBack(s.getBuffer(), limit, spanCondition)</code>
908     * after pinning limit to 0<=limit<=s.length().
909     * @param s the string
910     * @param limit the exclusive-end index in the string for the span operation
911     *              (use s.length() or INT32_MAX for spanning back from the end of the string)
912     * @param spanCondition specifies the containment condition
913     * @return the start of the substring according to the spanCondition;
914     *         the substring s.tempSubStringBetween(start, limit) fulfills the spanCondition
915     * @stable ICU 4.4
916     * @see USetSpanCondition
917     */
918    inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
919
920    /**
921     * Returns the length of the initial substring of the input string which
922     * consists only of characters and strings that are contained in this set
923     * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
924     * or only of characters and strings that are not contained
925     * in this set (USET_SPAN_NOT_CONTAINED).
926     * See USetSpanCondition for details.
927     * Similar to the strspn() C library function.
928     * Malformed byte sequences are treated according to contains(0xfffd).
929     * This function works faster with a frozen set and with a non-negative string length argument.
930     * @param s start of the string (UTF-8)
931     * @param length of the string; can be -1 for NUL-terminated
932     * @param spanCondition specifies the containment condition
933     * @return the length of the initial substring according to the spanCondition;
934     *         0 if the start of the string does not fit the spanCondition
935     * @stable ICU 3.8
936     * @see USetSpanCondition
937     */
938    int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
939
940    /**
941     * Returns the start of the trailing substring of the input string which
942     * consists only of characters and strings that are contained in this set
943     * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
944     * or only of characters and strings that are not contained
945     * in this set (USET_SPAN_NOT_CONTAINED).
946     * See USetSpanCondition for details.
947     * Malformed byte sequences are treated according to contains(0xfffd).
948     * This function works faster with a frozen set and with a non-negative string length argument.
949     * @param s start of the string (UTF-8)
950     * @param length of the string; can be -1 for NUL-terminated
951     * @param spanCondition specifies the containment condition
952     * @return the start of the trailing substring according to the spanCondition;
953     *         the string length if the end of the string does not fit the spanCondition
954     * @stable ICU 3.8
955     * @see USetSpanCondition
956     */
957    int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
958
959    /**
960     * Implement UnicodeMatcher::matches()
961     * @stable ICU 2.4
962     */
963    virtual UMatchDegree matches(const Replaceable& text,
964                         int32_t& offset,
965                         int32_t limit,
966                         UBool incremental);
967
968private:
969    /**
970     * Returns the longest match for s in text at the given position.
971     * If limit > start then match forward from start+1 to limit
972     * matching all characters except s.charAt(0).  If limit < start,
973     * go backward starting from start-1 matching all characters
974     * except s.charAt(s.length()-1).  This method assumes that the
975     * first character, text.charAt(start), matches s, so it does not
976     * check it.
977     * @param text the text to match
978     * @param start the first character to match.  In the forward
979     * direction, text.charAt(start) is matched against s.charAt(0).
980     * In the reverse direction, it is matched against
981     * s.charAt(s.length()-1).
982     * @param limit the limit offset for matching, either last+1 in
983     * the forward direction, or last-1 in the reverse direction,
984     * where last is the index of the last character to match.
985     * @param s the string to be matched against text
986     * @return If part of s matches up to the limit, return |limit -
987     * start|.  If all of s matches before reaching the limit, return
988     * s.length().  If there is a mismatch between s and text, return
989     * 0
990     */
991    static int32_t matchRest(const Replaceable& text,
992                             int32_t start, int32_t limit,
993                             const UnicodeString& s);
994
995    /**
996     * Returns the smallest value i such that c < list[i].  Caller
997     * must ensure that c is a legal value or this method will enter
998     * an infinite loop.  This method performs a binary search.
999     * @param c a character in the range MIN_VALUE..MAX_VALUE
1000     * inclusive
1001     * @return the smallest integer i in the range 0..len-1,
1002     * inclusive, such that c < list[i]
1003     */
1004    int32_t findCodePoint(UChar32 c) const;
1005
1006public:
1007
1008    /**
1009     * Implementation of UnicodeMatcher API.  Union the set of all
1010     * characters that may be matched by this object into the given
1011     * set.
1012     * @param toUnionTo the set into which to union the source characters
1013     * @stable ICU 2.4
1014     */
1015    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
1016
1017    /**
1018     * Returns the index of the given character within this set, where
1019     * the set is ordered by ascending code point.  If the character
1020     * is not in this set, return -1.  The inverse of this method is
1021     * <code>charAt()</code>.
1022     * @return an index from 0..size()-1, or -1
1023     * @stable ICU 2.4
1024     */
1025    int32_t indexOf(UChar32 c) const;
1026
1027    /**
1028     * Returns the character at the given index within this set, where
1029     * the set is ordered by ascending code point.  If the index is
1030     * out of range, return (UChar32)-1.  The inverse of this method is
1031     * <code>indexOf()</code>.
1032     * @param index an index from 0..size()-1
1033     * @return the character at the given index, or (UChar32)-1.
1034     * @stable ICU 2.4
1035     */
1036    UChar32 charAt(int32_t index) const;
1037
1038    /**
1039     * Adds the specified range to this set if it is not already
1040     * present.  If this set already contains the specified range,
1041     * the call leaves this set unchanged.  If <code>start > end</code>
1042     * then an empty range is added, leaving the set unchanged.
1043     * This is equivalent to a boolean logic OR, or a set UNION.
1044     * A frozen set will not be modified.
1045     *
1046     * @param start first character, inclusive, of range to be added
1047     * to this set.
1048     * @param end last character, inclusive, of range to be added
1049     * to this set.
1050     * @stable ICU 2.0
1051     */
1052    virtual UnicodeSet& add(UChar32 start, UChar32 end);
1053
1054    /**
1055     * Adds the specified character to this set if it is not already
1056     * present.  If this set already contains the specified character,
1057     * the call leaves this set unchanged.
1058     * A frozen set will not be modified.
1059     * @stable ICU 2.0
1060     */
1061    UnicodeSet& add(UChar32 c);
1062
1063    /**
1064     * Adds the specified multicharacter string to this set if it is not already
1065     * present.  If this set already contains the multicharacter string,
1066     * the call leaves this set unchanged.
1067     * Thus "ch" => {"ch"}
1068     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1069     * A frozen set will not be modified.
1070     * @param s the source string
1071     * @return this object, for chaining
1072     * @stable ICU 2.4
1073     */
1074    UnicodeSet& add(const UnicodeString& s);
1075
1076 private:
1077    /**
1078     * @return a code point IF the string consists of a single one.
1079     * otherwise returns -1.
1080     * @param s string to test
1081     */
1082    static int32_t getSingleCP(const UnicodeString& s);
1083
1084    void _add(const UnicodeString& s);
1085
1086 public:
1087    /**
1088     * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1089     * If this set already contains any particular character, it has no effect on that character.
1090     * A frozen set will not be modified.
1091     * @param s the source string
1092     * @return this object, for chaining
1093     * @stable ICU 2.4
1094     */
1095    UnicodeSet& addAll(const UnicodeString& s);
1096
1097    /**
1098     * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1099     * Characters of this set that are not in the string are removed from this set.
1100     * A frozen set will not be modified.
1101     * @param s the source string
1102     * @return this object, for chaining
1103     * @stable ICU 2.4
1104     */
1105    UnicodeSet& retainAll(const UnicodeString& s);
1106
1107    /**
1108     * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1109     * A character that is in this set is removed; a character that is not in this set is added.
1110     * A frozen set will not be modified.
1111     * @param s the source string
1112     * @return this object, for chaining
1113     * @stable ICU 2.4
1114     */
1115    UnicodeSet& complementAll(const UnicodeString& s);
1116
1117    /**
1118     * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1119     * If this set does not contain a particular character, it has no effect on that character.
1120     * A frozen set will not be modified.
1121     * @param s the source string
1122     * @return this object, for chaining
1123     * @stable ICU 2.4
1124     */
1125    UnicodeSet& removeAll(const UnicodeString& s);
1126
1127    /**
1128     * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1129     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1130     * @param s the source string
1131     * @return a newly created set containing the given string.
1132     * The caller owns the return object and is responsible for deleting it.
1133     * @stable ICU 2.4
1134     */
1135    static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1136
1137
1138    /**
1139     * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1140     * @param s the source string
1141     * @return a newly created set containing the given characters.
1142     * The caller owns the return object and is responsible for deleting it.
1143     * @stable ICU 2.4
1144     */
1145    static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1146
1147    /**
1148     * Retain only the elements in this set that are contained in the
1149     * specified range.  If <code>start > end</code> then an empty range is
1150     * retained, leaving the set empty.  This is equivalent to
1151     * a boolean logic AND, or a set INTERSECTION.
1152     * A frozen set will not be modified.
1153     *
1154     * @param start first character, inclusive, of range to be retained
1155     * in this set.
1156     * @param end last character, inclusive, of range to be retained
1157     * in this set.
1158     * @stable ICU 2.0
1159     */
1160    virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1161
1162
1163    /**
1164     * Retain the specified character from this set if it is present.
1165     * A frozen set will not be modified.
1166     * @stable ICU 2.0
1167     */
1168    UnicodeSet& retain(UChar32 c);
1169
1170    /**
1171     * Removes the specified range from this set if it is present.
1172     * The set will not contain the specified range once the call
1173     * returns.  If <code>start > end</code> then an empty range is
1174     * removed, leaving the set unchanged.
1175     * A frozen set will not be modified.
1176     *
1177     * @param start first character, inclusive, of range to be removed
1178     * from this set.
1179     * @param end last character, inclusive, of range to be removed
1180     * from this set.
1181     * @stable ICU 2.0
1182     */
1183    virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1184
1185    /**
1186     * Removes the specified character from this set if it is present.
1187     * The set will not contain the specified character once the call
1188     * returns.
1189     * A frozen set will not be modified.
1190     * @stable ICU 2.0
1191     */
1192    UnicodeSet& remove(UChar32 c);
1193
1194    /**
1195     * Removes the specified string from this set if it is present.
1196     * The set will not contain the specified string once the call
1197     * returns.
1198     * A frozen set will not be modified.
1199     * @param s the source string
1200     * @return this object, for chaining
1201     * @stable ICU 2.4
1202     */
1203    UnicodeSet& remove(const UnicodeString& s);
1204
1205    /**
1206     * Inverts this set.  This operation modifies this set so that
1207     * its value is its complement.  This is equivalent to
1208     * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1209     * A frozen set will not be modified.
1210     * @stable ICU 2.0
1211     */
1212    virtual UnicodeSet& complement(void);
1213
1214    /**
1215     * Complements the specified range in this set.  Any character in
1216     * the range will be removed if it is in this set, or will be
1217     * added if it is not in this set.  If <code>start > end</code>
1218     * then an empty range is complemented, leaving the set unchanged.
1219     * This is equivalent to a boolean logic XOR.
1220     * A frozen set will not be modified.
1221     *
1222     * @param start first character, inclusive, of range to be complemented
1223     * in this set.
1224     * @param end last character, inclusive, of range to be complemented
1225     * in this set.
1226     * @stable ICU 2.0
1227     */
1228    virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1229
1230    /**
1231     * Complements the specified character in this set.  The character
1232     * will be removed if it is in this set, or will be added if it is
1233     * not in this set.
1234     * A frozen set will not be modified.
1235     * @stable ICU 2.0
1236     */
1237    UnicodeSet& complement(UChar32 c);
1238
1239    /**
1240     * Complement the specified string in this set.
1241     * The string will be removed if it is in this set, or will be
1242     * added if it is not in this set.
1243     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1244     * A frozen set will not be modified.
1245     * @param s the string to complement
1246     * @return this object, for chaining
1247     * @stable ICU 2.4
1248     */
1249    UnicodeSet& complement(const UnicodeString& s);
1250
1251    /**
1252     * Adds all of the elements in the specified set to this set if
1253     * they're not already present.  This operation effectively
1254     * modifies this set so that its value is the <i>union</i> of the two
1255     * sets.  The behavior of this operation is unspecified if the specified
1256     * collection is modified while the operation is in progress.
1257     * A frozen set will not be modified.
1258     *
1259     * @param c set whose elements are to be added to this set.
1260     * @see #add(UChar32, UChar32)
1261     * @stable ICU 2.0
1262     */
1263    virtual UnicodeSet& addAll(const UnicodeSet& c);
1264
1265    /**
1266     * Retains only the elements in this set that are contained in the
1267     * specified set.  In other words, removes from this set all of
1268     * its elements that are not contained in the specified set.  This
1269     * operation effectively modifies this set so that its value is
1270     * the <i>intersection</i> of the two sets.
1271     * A frozen set will not be modified.
1272     *
1273     * @param c set that defines which elements this set will retain.
1274     * @stable ICU 2.0
1275     */
1276    virtual UnicodeSet& retainAll(const UnicodeSet& c);
1277
1278    /**
1279     * Removes from this set all of its elements that are contained in the
1280     * specified set.  This operation effectively modifies this
1281     * set so that its value is the <i>asymmetric set difference</i> of
1282     * the two sets.
1283     * A frozen set will not be modified.
1284     *
1285     * @param c set that defines which elements will be removed from
1286     *          this set.
1287     * @stable ICU 2.0
1288     */
1289    virtual UnicodeSet& removeAll(const UnicodeSet& c);
1290
1291    /**
1292     * Complements in this set all elements contained in the specified
1293     * set.  Any character in the other set will be removed if it is
1294     * in this set, or will be added if it is not in this set.
1295     * A frozen set will not be modified.
1296     *
1297     * @param c set that defines which elements will be complemented
1298     *          (xor'ed) in this set.
1299     * @stable ICU 2.4
1300     */
1301    virtual UnicodeSet& complementAll(const UnicodeSet& c);
1302
1303    /**
1304     * Removes all of the elements from this set.  This set will be
1305     * empty after this call returns.
1306     * A frozen set will not be modified.
1307     * @stable ICU 2.0
1308     */
1309    virtual UnicodeSet& clear(void);
1310
1311    /**
1312     * Close this set over the given attribute.  For the attribute
1313     * USET_CASE, the result is to modify this set so that:
1314     *
1315     * 1. For each character or string 'a' in this set, all strings or
1316     * characters 'b' such that foldCase(a) == foldCase(b) are added
1317     * to this set.
1318     *
1319     * 2. For each string 'e' in the resulting set, if e !=
1320     * foldCase(e), 'e' will be removed.
1321     *
1322     * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
1323     *
1324     * (Here foldCase(x) refers to the operation u_strFoldCase, and a
1325     * == b denotes that the contents are the same, not pointer
1326     * comparison.)
1327     *
1328     * A frozen set will not be modified.
1329     *
1330     * @param attribute bitmask for attributes to close over.
1331     * Currently only the USET_CASE bit is supported.  Any undefined bits
1332     * are ignored.
1333     * @return a reference to this set.
1334     * @stable ICU 4.2
1335     */
1336    UnicodeSet& closeOver(int32_t attribute);
1337
1338    /**
1339     * Remove all strings from this set.
1340     *
1341     * @return a reference to this set.
1342     * @stable ICU 4.2
1343     */
1344    virtual UnicodeSet &removeAllStrings();
1345
1346    /**
1347     * Iteration method that returns the number of ranges contained in
1348     * this set.
1349     * @see #getRangeStart
1350     * @see #getRangeEnd
1351     * @stable ICU 2.4
1352     */
1353    virtual int32_t getRangeCount(void) const;
1354
1355    /**
1356     * Iteration method that returns the first character in the
1357     * specified range of this set.
1358     * @see #getRangeCount
1359     * @see #getRangeEnd
1360     * @stable ICU 2.4
1361     */
1362    virtual UChar32 getRangeStart(int32_t index) const;
1363
1364    /**
1365     * Iteration method that returns the last character in the
1366     * specified range of this set.
1367     * @see #getRangeStart
1368     * @see #getRangeCount
1369     * @stable ICU 2.4
1370     */
1371    virtual UChar32 getRangeEnd(int32_t index) const;
1372
1373    /**
1374     * Serializes this set into an array of 16-bit integers.  Serialization
1375     * (currently) only records the characters in the set; multicharacter
1376     * strings are ignored.
1377     *
1378     * The array has following format (each line is one 16-bit
1379     * integer):
1380     *
1381     *  length     = (n+2*m) | (m!=0?0x8000:0)
1382     *  bmpLength  = n; present if m!=0
1383     *  bmp[0]
1384     *  bmp[1]
1385     *  ...
1386     *  bmp[n-1]
1387     *  supp-high[0]
1388     *  supp-low[0]
1389     *  supp-high[1]
1390     *  supp-low[1]
1391     *  ...
1392     *  supp-high[m-1]
1393     *  supp-low[m-1]
1394     *
1395     * The array starts with a header.  After the header are n bmp
1396     * code points, then m supplementary code points.  Either n or m
1397     * or both may be zero.  n+2*m is always <= 0x7FFF.
1398     *
1399     * If there are no supplementary characters (if m==0) then the
1400     * header is one 16-bit integer, 'length', with value n.
1401     *
1402     * If there are supplementary characters (if m!=0) then the header
1403     * is two 16-bit integers.  The first, 'length', has value
1404     * (n+2*m)|0x8000.  The second, 'bmpLength', has value n.
1405     *
1406     * After the header the code points are stored in ascending order.
1407     * Supplementary code points are stored as most significant 16
1408     * bits followed by least significant 16 bits.
1409     *
1410     * @param dest pointer to buffer of destCapacity 16-bit integers.
1411     * May be NULL only if destCapacity is zero.
1412     * @param destCapacity size of dest, or zero.  Must not be negative.
1413     * @param ec error code.  Will be set to U_INDEX_OUTOFBOUNDS_ERROR
1414     * if n+2*m > 0x7FFF.  Will be set to U_BUFFER_OVERFLOW_ERROR if
1415     * n+2*m+(m!=0?2:1) > destCapacity.
1416     * @return the total length of the serialized format, including
1417     * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
1418     * than U_BUFFER_OVERFLOW_ERROR.
1419     * @stable ICU 2.4
1420     */
1421    int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1422
1423    /**
1424     * Reallocate this object's internal structures to take up the least
1425     * possible space, without changing this object's value.
1426     * A frozen set will not be modified.
1427     * @stable ICU 2.4
1428     */
1429    virtual UnicodeSet& compact();
1430
1431    /**
1432     * Return the class ID for this class.  This is useful only for
1433     * comparing to a return value from getDynamicClassID().  For example:
1434     * <pre>
1435     * .      Base* polymorphic_pointer = createPolymorphicObject();
1436     * .      if (polymorphic_pointer->getDynamicClassID() ==
1437     * .          Derived::getStaticClassID()) ...
1438     * </pre>
1439     * @return          The class ID for all objects of this class.
1440     * @stable ICU 2.0
1441     */
1442    static UClassID U_EXPORT2 getStaticClassID(void);
1443
1444    /**
1445     * Implement UnicodeFunctor API.
1446     *
1447     * @return The class ID for this object. All objects of a given
1448     * class have the same class ID.  Objects of other classes have
1449     * different class IDs.
1450     * @stable ICU 2.4
1451     */
1452    virtual UClassID getDynamicClassID(void) const;
1453
1454private:
1455
1456    // Private API for the USet API
1457
1458    friend class USetAccess;
1459
1460    int32_t getStringCount() const;
1461
1462    const UnicodeString* getString(int32_t index) const;
1463
1464    //----------------------------------------------------------------
1465    // RuleBasedTransliterator support
1466    //----------------------------------------------------------------
1467
1468private:
1469
1470    /**
1471     * Returns <tt>true</tt> if this set contains any character whose low byte
1472     * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
1473     * indexing.
1474     */
1475    virtual UBool matchesIndexValue(uint8_t v) const;
1476
1477private:
1478    friend class RBBIRuleScanner;
1479
1480    //----------------------------------------------------------------
1481    // Implementation: Clone as thawed (see ICU4J Freezable)
1482    //----------------------------------------------------------------
1483
1484    UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1485
1486    //----------------------------------------------------------------
1487    // Implementation: Pattern parsing
1488    //----------------------------------------------------------------
1489
1490    void applyPatternIgnoreSpace(const UnicodeString& pattern,
1491                                 ParsePosition& pos,
1492                                 const SymbolTable* symbols,
1493                                 UErrorCode& status);
1494
1495    void applyPattern(RuleCharacterIterator& chars,
1496                      const SymbolTable* symbols,
1497                      UnicodeString& rebuiltPat,
1498                      uint32_t options,
1499                      UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1500                      UErrorCode& ec);
1501
1502    //----------------------------------------------------------------
1503    // Implementation: Utility methods
1504    //----------------------------------------------------------------
1505
1506    void ensureCapacity(int32_t newLen, UErrorCode& ec);
1507
1508    void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
1509
1510    void swapBuffers(void);
1511
1512    UBool allocateStrings(UErrorCode &status);
1513
1514    UnicodeString& _toPattern(UnicodeString& result,
1515                              UBool escapeUnprintable) const;
1516
1517    UnicodeString& _generatePattern(UnicodeString& result,
1518                                    UBool escapeUnprintable) const;
1519
1520    static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1521
1522    static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1523
1524    //----------------------------------------------------------------
1525    // Implementation: Fundamental operators
1526    //----------------------------------------------------------------
1527
1528    void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1529
1530    void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1531
1532    void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1533
1534    /**
1535     * Return true if the given position, in the given pattern, appears
1536     * to be the start of a property set pattern [:foo:], \\p{foo}, or
1537     * \\P{foo}, or \\N{name}.
1538     */
1539    static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1540                                          int32_t pos);
1541
1542    static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1543                                          int32_t iterOpts);
1544
1545    /**
1546     * Parse the given property pattern at the given parse position
1547     * and set this UnicodeSet to the result.
1548     *
1549     * The original design document is out of date, but still useful.
1550     * Ignore the property and value names:
1551     * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/unicodeset_properties.html
1552     *
1553     * Recognized syntax:
1554     *
1555     * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
1556     * \\p{foo} \\P{foo}  - white space not allowed within "\\p" or "\\P"
1557     * \\N{name}         - white space not allowed within "\\N"
1558     *
1559     * Other than the above restrictions, Unicode Pattern_White_Space characters are ignored.
1560     * Case is ignored except in "\\p" and "\\P" and "\\N".  In 'name' leading
1561     * and trailing space is deleted, and internal runs of whitespace
1562     * are collapsed to a single space.
1563     *
1564     * We support binary properties, enumerated properties, and the
1565     * following non-enumerated properties:
1566     *
1567     *  Numeric_Value
1568     *  Name
1569     *  Unicode_1_Name
1570     *
1571     * @param pattern the pattern string
1572     * @param ppos on entry, the position at which to begin parsing.
1573     * This should be one of the locations marked '^':
1574     *
1575     *   [:blah:]     \\p{blah}     \\P{blah}     \\N{name}
1576     *   ^       %    ^       %    ^       %    ^       %
1577     *
1578     * On return, the position after the last character parsed, that is,
1579     * the locations marked '%'.  If the parse fails, ppos is returned
1580     * unchanged.
1581     * @param ec status
1582     * @return a reference to this.
1583     */
1584    UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1585                                     ParsePosition& ppos,
1586                                     UErrorCode &ec);
1587
1588    void applyPropertyPattern(RuleCharacterIterator& chars,
1589                              UnicodeString& rebuiltPat,
1590                              UErrorCode& ec);
1591
1592    friend void UnicodeSet_initInclusion(int32_t src, UErrorCode &status);
1593    static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
1594
1595    /**
1596     * A filter that returns TRUE if the given code point should be
1597     * included in the UnicodeSet being constructed.
1598     */
1599    typedef UBool (*Filter)(UChar32 codePoint, void* context);
1600
1601    /**
1602     * Given a filter, set this UnicodeSet to the code points
1603     * contained by that filter.  The filter MUST be
1604     * property-conformant.  That is, if it returns value v for one
1605     * code point, then it must return v for all affiliated code
1606     * points, as defined by the inclusions list.  See
1607     * getInclusions().
1608     * src is a UPropertySource value.
1609     */
1610    void applyFilter(Filter filter,
1611                     void* context,
1612                     int32_t src,
1613                     UErrorCode &status);
1614
1615    /**
1616     * Set the new pattern to cache.
1617     */
1618    void setPattern(const UnicodeString& newPat);
1619    /**
1620     * Release existing cached pattern.
1621     */
1622    void releasePattern();
1623
1624    friend class UnicodeSetIterator;
1625};
1626
1627
1628
1629inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
1630    return !operator==(o);
1631}
1632
1633inline UBool UnicodeSet::isFrozen() const {
1634    return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
1635}
1636
1637inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1638    return !containsNone(start, end);
1639}
1640
1641inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1642    return !containsNone(s);
1643}
1644
1645inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1646    return !containsNone(s);
1647}
1648
1649inline UBool UnicodeSet::isBogus() const {
1650    return (UBool)(fFlags & kIsBogus);
1651}
1652
1653inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
1654    return reinterpret_cast<UnicodeSet *>(uset);
1655}
1656
1657inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1658    return reinterpret_cast<const UnicodeSet *>(uset);
1659}
1660
1661inline USet *UnicodeSet::toUSet() {
1662    return reinterpret_cast<USet *>(this);
1663}
1664
1665inline const USet *UnicodeSet::toUSet() const {
1666    return reinterpret_cast<const USet *>(this);
1667}
1668
1669inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1670    int32_t sLength=s.length();
1671    if(start<0) {
1672        start=0;
1673    } else if(start>sLength) {
1674        start=sLength;
1675    }
1676    return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1677}
1678
1679inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1680    int32_t sLength=s.length();
1681    if(limit<0) {
1682        limit=0;
1683    } else if(limit>sLength) {
1684        limit=sLength;
1685    }
1686    return spanBack(s.getBuffer(), limit, spanCondition);
1687}
1688
1689U_NAMESPACE_END
1690
1691#endif
1692