164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// Copyright (C) 2016 and later: Unicode, Inc. and others. 264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// rbbisetb.h 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru********************************************************************** 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Copyright (c) 2001-2005, International Business Machines 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru********************************************************************** 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/ 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef RBBISETB_H 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define RBBISETB_H 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uobject.h" 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbirb.h" 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uvector.h" 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustruct UNewTrie; 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// RBBISetBuilder Derives the character categories used by the runtime RBBI engine 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// from the Unicode Sets appearing in the source RBBI rules, and 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// creates the TRIE table used to map from Unicode to the 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// character categories. 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// RangeDescriptor 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Each of the non-overlapping character ranges gets one of these descriptors. 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// All of them are strung together in a linked list, which is kept in order 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// (by character) 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass RangeDescriptor : public UMemory { 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 fStartChar; // Start of range, unicode 32 bit value. 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 fEndChar; // End of range, unicode 32 bit value. 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fNum; // runtime-mapped input value for this range. 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UVector *fIncludesSets; // vector of the the original 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Unicode sets that include this range. 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // (Contains ptrs to uset nodes) 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RangeDescriptor(UErrorCode &status); 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~RangeDescriptor(); 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // where appearing in the second (higher) part. 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void setDictionaryFlag(); // Check whether this range appears as part of 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // the Unicode set named "dictionary" 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Starting with the rules parse tree from the scanner, 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// - Enumerate the set of UnicodeSets that are referenced 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// by the RBBI rules. 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// - compute a derived set of non-overlapping UnicodeSets 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// that will correspond to columns in the state table for 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// the RBBI execution engine. 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// - construct the trie table that maps input characters 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// to set numbers in the non-overlapping set of sets. 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass RBBISetBuilder : public UMemory { 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querupublic: 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBISetBuilder(RBBIRuleBuilder *rb); 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~RBBISetBuilder(); 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void build(); 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void addValToSets(UVector *sets, uint32_t val); 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void addValToSet (RBBINode *usetNode, uint32_t val); 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // runtime state machine, which are the same as 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // columns in the DFA state table 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void serializeTrie(uint8_t *where); // write out the serialized Trie. 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 getFirstChar(int32_t val) const; 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // character were encountered. 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifdef RBBI_DEBUG 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void printSets(); 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void printRanges(); 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void printRangeGroups(); 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#else 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru #define printSets() 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru #define printRanges() 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru #define printRangeGroups() 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruprivate: 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void numberSets(); 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode *fStatus; 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UNewTrie *fTrie; // The mapping TRIE that is the end result of processing 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t fTrieSize; // the Unicode Sets. 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Groups correspond to character categories - 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // groups of ranges that are in the same original UnicodeSets. 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // fGroupCount is the index of the last used group. 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // State table column 0 is not used. Column 1 is for end-of-input. 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // column 2 is for group 0. Funny counting. 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t fGroupCount; 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool fSawBOF; 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 133