164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// Copyright (C) 2016 and later: Unicode, Inc. and others.
264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html
3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  file:  rbbirb.cpp
5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius//  Copyright (C) 2002-2011, International Business Machines Corporation and others.
7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  All Rights Reserved.
8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//    building (compiling) break rules into the tables required by the runtime
11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//    RBBI engine.
12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h"
15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION
17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/brkiter.h"
19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/rbbi.h"
20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ubrk.h"
21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unistr.h"
22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h"
23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uchar.h"
24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uchriter.h"
25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/parsepos.h"
26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/parseerr.h"
27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h"
28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cstring.h"
29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbirb.h"
31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbinode.h"
32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbiscan.h"
34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbisetb.h"
35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbitblb.h"
36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbidata.h"
37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN
40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//----------------------------------------------------------------------------------------
43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  Constructor.
45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//----------------------------------------------------------------------------------------
47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruRBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
4885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                                       UParseError     *parseErr,
49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                       UErrorCode      &status)
50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru : fRules(rules)
51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fStatus = &status; // status is checked below
5385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    fParseError = parseErr;
54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fDebugEnv   = NULL;
55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifdef RBBI_DEBUG
56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fDebugEnv   = getenv("U_RBBIDEBUG");
57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fForwardTree        = NULL;
61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fReverseTree        = NULL;
62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSafeFwdTree        = NULL;
63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSafeRevTree        = NULL;
64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fDefaultTree        = &fForwardTree;
65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fForwardTables      = NULL;
66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fReverseTables      = NULL;
67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSafeFwdTables      = NULL;
68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSafeRevTables      = NULL;
69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fRuleStatusVals     = NULL;
70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fChainRules         = FALSE;
71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fLBCMNoChain        = FALSE;
72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fLookAheadHardBreak = FALSE;
73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fUSetNodes          = NULL;
74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fRuleStatusVals     = NULL;
75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fScanner            = NULL;
76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSetBuilder         = NULL;
7785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    if (parseErr) {
7885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        uprv_memset(parseErr, 0, sizeof(UParseError));
7985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    }
80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_FAILURE(status)) {
82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fUSetNodes          = new UVector(status); // bcos status gets overwritten here
86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fRuleStatusVals     = new UVector(status);
87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fScanner            = new RBBIRuleScanner(this);
88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSetBuilder         = new RBBISetBuilder(this);
89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_FAILURE(status)) {
90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return;
91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        status = U_MEMORY_ALLOCATION_ERROR;
94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//----------------------------------------------------------------------------------------
100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  Destructor
102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//----------------------------------------------------------------------------------------
104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruRBBIRuleBuilder::~RBBIRuleBuilder() {
105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int        i;
107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for (i=0; ; i++) {
108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        if (n==NULL) {
110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru            break;
111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        }
112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        delete n;
113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fUSetNodes;
116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fSetBuilder;
117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fForwardTables;
118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fReverseTables;
119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fSafeFwdTables;
120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fSafeRevTables;
121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fForwardTree;
123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fReverseTree;
124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fSafeFwdTree;
125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fSafeRevTree;
126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fScanner;
127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    delete fRuleStatusVals;
128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//----------------------------------------------------------------------------------------
135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//   flattenData() -  Collect up the compiled RBBI rule data and put it into
137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//                    the format for saving in ICU data files,
138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//                    which is also the format needed by the RBBI runtime engine.
139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//----------------------------------------------------------------------------------------
// Round a byte count up to the next multiple of 8; multiples of 8 are returned unchanged.
static int32_t align8(int32_t i) {
    const int32_t kLowBits = 7;
    return (i + kLowBits) & ~kLowBits;
}
142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruRBBIDataHeader *RBBIRuleBuilder::flattenData() {
144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t    i;
145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_FAILURE(*fStatus)) {
147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Remove comments and whitespace from the rules to make it smaller.
151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Calculate the size of each section in the data.
154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //   Sizes here are padded up to a multiple of 8 for better memory alignment.
155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //   Sections sizes actually stored in the header are for the actual data
156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //     without the padding.
157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t headerSize        = align8(sizeof(RBBIDataHeader));
159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t reverseTableSize  = align8(fReverseTables->getTableSize());
161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t safeFwdTableSize  = align8(fSafeFwdTables->getTableSize());
162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t trieSize          = align8(fSetBuilder->getTrieSize());
164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t rulesSize         = align8((strippedRules.length()+1) * sizeof(UChar));
166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t         totalSize = headerSize + forwardTableSize + reverseTableSize
168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                + safeFwdTableSize + safeRevTableSize
169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                + statusTableSize + trieSize + rulesSize;
170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (data == NULL) {
173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        *fStatus = U_MEMORY_ALLOCATION_ERROR;
174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memset(data, 0, totalSize);
177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fMagic            = 0xb1a0;
180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fFormatVersion[0] = 3;
181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fFormatVersion[1] = 1;
182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fFormatVersion[2] = 0;
183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fFormatVersion[3] = 0;
184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fLength           = totalSize;
185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fCatCount         = fSetBuilder->getNumCharCategories();
186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fFTable        = headerSize;
188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fFTableLen     = forwardTableSize;
189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fRTable        = data->fFTable  + forwardTableSize;
190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fRTableLen     = reverseTableSize;
191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fSFTable       = data->fRTable  + reverseTableSize;
192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fSFTableLen    = safeFwdTableSize;
193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fSRTable       = data->fSFTable + safeFwdTableSize;
194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fSRTableLen    = safeRevTableSize;
195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fTrie          = data->fSRTable + safeRevTableSize;
197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fTrieLen       = fSetBuilder->getTrieSize();
198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fStatusTable   = data->fTrie    + trieSize;
199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fStatusTableLen= statusTableSize;
200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fRuleSource    = data->fStatusTable + statusTableSize;
201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fForwardTables->exportTable((uint8_t *)data + data->fFTable);
206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fReverseTables->exportTable((uint8_t *)data + data->fRTable);
207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    for (i=0; i<fRuleStatusVals->size(); i++) {
213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return data;
219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//----------------------------------------------------------------------------------------
227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//  createRuleBasedBreakIterator    construct from source rules that are passed in
229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//                                  in a UnicodeString
230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//
231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//----------------------------------------------------------------------------------------
232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBreakIterator *
233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruRBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
23485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho                                    UParseError      *parseError,
235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru                                    UErrorCode       &status)
236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{
237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // status checked below
238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // Read the input rules, generate a parse tree, symbol table,
241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // and list of all Unicode Sets referenced by the rules.
242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    RBBIRuleBuilder  builder(rules, parseError, status);
244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_FAILURE(status)) { // status checked here bcos build below doesn't
245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
24785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    builder.fScanner->parse();
248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // UnicodeSet processing.
251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //    Munge the Unicode Sets to create a set of character categories.
252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //    Generate the mapping tables (TRIE) from input 32-bit characters to
253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //    the character categories.
254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    builder.fSetBuilder->build();
256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //   Generate the DFA state transition table.
260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
26583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius    if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
26683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    {
268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        status = U_MEMORY_ALLOCATION_ERROR;
26983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        delete builder.fForwardTables; builder.fForwardTables = NULL;
27083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        delete builder.fReverseTables; builder.fReverseTables = NULL;
27183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
27283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius        delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        return NULL;
274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    builder.fForwardTables->build();
277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    builder.fReverseTables->build();
278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    builder.fSafeFwdTables->build();
279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    builder.fSafeRevTables->build();
280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifdef RBBI_DEBUG
282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        builder.fForwardTables->printRuleStatusTable();
284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif
286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //   Package up the compiled data into a memory image
289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //      in the run-time format.
290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
29285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    if (U_FAILURE(*builder.fStatus)) {
29385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        return NULL;
29485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    }
295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //  Clean up the compiler related stuff
299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //  Create a break iterator from the compiled rules.
304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //     (Identical to creation from stored pre-compiled rules)
305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    //
306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    // status is checked after init in construction.
307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    if (U_FAILURE(status)) {
309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        delete This;
310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        This = NULL;
311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    else if(This == NULL) { // test for NULL
313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru        status = U_MEMORY_ALLOCATION_ERROR;
314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    }
315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru    return This;
316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}
317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END
319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru
320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
321