1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3//
4//  rbbisetb.h
5/*
6**********************************************************************
7*   Copyright (c) 2001-2005, International Business Machines
8*   Corporation and others.  All Rights Reserved.
9**********************************************************************
10*/
11
12#ifndef RBBISETB_H
13#define RBBISETB_H
14
15#include "unicode/utypes.h"
16
17#if !UCONFIG_NO_BREAK_ITERATION
18
19#include "unicode/uobject.h"
20#include "rbbirb.h"
21#include "utrie2.h"
22#include "uvector.h"
23
24U_NAMESPACE_BEGIN
25
26//
27//  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
28//                   from the Unicode Sets appearing in the source  RBBI rules, and
29//                   creates the TRIE table used to map from Unicode to the
30//                   character categories.
31//
32
33
34//
35//  RangeDescriptor
36//
37//     Each of the non-overlapping character ranges gets one of these descriptors.
38//     All of them are strung together in a linked list, which is kept in order
39//     (by character)
40//
41class RangeDescriptor : public UMemory {
42public:
43    UChar32            fStartChar;      // Start of range, unicode 32 bit value.
44    UChar32            fEndChar;        // End of range, unicode 32 bit value.
45    int32_t            fNum;            // runtime-mapped input value for this range.
46    UVector           *fIncludesSets;   // vector of the the original
47                                        //   Unicode sets that include this range.
48                                        //    (Contains ptrs to uset nodes)
49    RangeDescriptor   *fNext;           // Next RangeDescriptor in the linked list.
50
51    RangeDescriptor(UErrorCode &status);
52    RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
53    ~RangeDescriptor();
54    void split(UChar32 where, UErrorCode &status);   // Spit this range in two at "where", with
55                                        //   where appearing in the second (higher) part.
56    void setDictionaryFlag();           // Check whether this range appears as part of
57                                        //   the Unicode set named "dictionary"
58
59private:
60    RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class
61    RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class
62};
63
64
65//
66//  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules.
67//
68//      Starting with the rules parse tree from the scanner,
69//
70//                   -  Enumerate the set of UnicodeSets that are referenced
71//                      by the RBBI rules.
72//                   -  compute a derived set of non-overlapping UnicodeSets
73//                      that will correspond to columns in the state table for
74//                      the RBBI execution engine.
75//                   -  construct the trie table that maps input characters
76//                      to set numbers in the non-overlapping set of sets.
77//
78
79
80class RBBISetBuilder : public UMemory {
81public:
82    RBBISetBuilder(RBBIRuleBuilder *rb);
83    ~RBBISetBuilder();
84
85    void     build();
86    void     addValToSets(UVector *sets,      uint32_t val);
87    void     addValToSet (RBBINode *usetNode, uint32_t val);
88    int32_t  getNumCharCategories() const;   // CharCategories are the same as input symbol set to the
89                                             //    runtime state machine, which are the same as
90                                             //    columns in the DFA state table
91    int32_t  getTrieSize() /*const*/;        // Size in bytes of the serialized Trie.
92    void     serializeTrie(uint8_t *where);  // write out the serialized Trie.
93    UChar32  getFirstChar(int32_t  val) const;
94    UBool    sawBOF() const;                 // Indicate whether any references to the {bof} pseudo
95                                             //   character were encountered.
96#ifdef RBBI_DEBUG
97    void     printSets();
98    void     printRanges();
99    void     printRangeGroups();
100#else
101    #define printSets()
102    #define printRanges()
103    #define printRangeGroups()
104#endif
105
106private:
107    void           numberSets();
108
109    RBBIRuleBuilder       *fRB;             // The RBBI Rule Compiler that owns us.
110    UErrorCode            *fStatus;
111
112    RangeDescriptor       *fRangeList;      // Head of the linked list of RangeDescriptors
113
114    UTrie2                *fTrie;           // The mapping TRIE that is the end result of processing
115    uint32_t               fTrieSize;       //  the Unicode Sets.
116
117    // Groups correspond to character categories -
118    //       groups of ranges that are in the same original UnicodeSets.
119    //       fGroupCount is the index of the last used group.
120    //       fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
121    //       State table column 0 is not used.  Column 1 is for end-of-input.
122    //       column 2 is for group 0.  Funny counting.
123    int32_t               fGroupCount;
124
125    UBool                 fSawBOF;
126
127    RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
128    RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class
129};
130
131
132
133U_NAMESPACE_END
134
135#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
136
137#endif
138