1//
2//  file:  rbbirb.cpp
3//
4//  Copyright (C) 2002-2011, International Business Machines Corporation and others.
5//  All Rights Reserved.
6//
7//  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
8//    building (compiling) break rules into the tables required by the runtime
9//    RBBI engine.
10//
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_BREAK_ITERATION
15
16#include "unicode/brkiter.h"
17#include "unicode/rbbi.h"
18#include "unicode/ubrk.h"
19#include "unicode/unistr.h"
20#include "unicode/uniset.h"
21#include "unicode/uchar.h"
22#include "unicode/uchriter.h"
23#include "unicode/parsepos.h"
24#include "unicode/parseerr.h"
25#include "cmemory.h"
26#include "cstring.h"
27
28#include "rbbirb.h"
29#include "rbbinode.h"
30
31#include "rbbiscan.h"
32#include "rbbisetb.h"
33#include "rbbitblb.h"
34#include "rbbidata.h"
35
36
37U_NAMESPACE_BEGIN
38
39
40//----------------------------------------------------------------------------------------
41//
42//  Constructor.
43//
44//----------------------------------------------------------------------------------------
45RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
46                                       UParseError     *parseErr,
47                                       UErrorCode      &status)
48 : fRules(rules)
49{
50    fStatus = &status; // status is checked below
51    fParseError = parseErr;
52    fDebugEnv   = NULL;
53#ifdef RBBI_DEBUG
54    fDebugEnv   = getenv("U_RBBIDEBUG");
55#endif
56
57
58    fForwardTree        = NULL;
59    fReverseTree        = NULL;
60    fSafeFwdTree        = NULL;
61    fSafeRevTree        = NULL;
62    fDefaultTree        = &fForwardTree;
63    fForwardTables      = NULL;
64    fReverseTables      = NULL;
65    fSafeFwdTables      = NULL;
66    fSafeRevTables      = NULL;
67    fRuleStatusVals     = NULL;
68    fChainRules         = FALSE;
69    fLBCMNoChain        = FALSE;
70    fLookAheadHardBreak = FALSE;
71    fUSetNodes          = NULL;
72    fRuleStatusVals     = NULL;
73    fScanner            = NULL;
74    fSetBuilder         = NULL;
75    if (parseErr) {
76        uprv_memset(parseErr, 0, sizeof(UParseError));
77    }
78
79    if (U_FAILURE(status)) {
80        return;
81    }
82
83    fUSetNodes          = new UVector(status); // bcos status gets overwritten here
84    fRuleStatusVals     = new UVector(status);
85    fScanner            = new RBBIRuleScanner(this);
86    fSetBuilder         = new RBBISetBuilder(this);
87    if (U_FAILURE(status)) {
88        return;
89    }
90    if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
91        status = U_MEMORY_ALLOCATION_ERROR;
92    }
93}
94
95
96
97//----------------------------------------------------------------------------------------
98//
99//  Destructor
100//
101//----------------------------------------------------------------------------------------
102RBBIRuleBuilder::~RBBIRuleBuilder() {
103
104    int        i;
105    for (i=0; ; i++) {
106        RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i);
107        if (n==NULL) {
108            break;
109        }
110        delete n;
111    }
112
113    delete fUSetNodes;
114    delete fSetBuilder;
115    delete fForwardTables;
116    delete fReverseTables;
117    delete fSafeFwdTables;
118    delete fSafeRevTables;
119
120    delete fForwardTree;
121    delete fReverseTree;
122    delete fSafeFwdTree;
123    delete fSafeRevTree;
124    delete fScanner;
125    delete fRuleStatusVals;
126}
127
128
129
130
131
132//----------------------------------------------------------------------------------------
133//
134//   flattenData() -  Collect up the compiled RBBI rule data and put it into
135//                    the format for saving in ICU data files,
136//                    which is also the format needed by the RBBI runtime engine.
137//
138//----------------------------------------------------------------------------------------
// Round a byte count up to the nearest multiple of 8.
static int32_t align8(int32_t i) {
    const int32_t bumped = i + 7;
    return bumped & ~static_cast<int32_t>(7);
}
140
141RBBIDataHeader *RBBIRuleBuilder::flattenData() {
142    int32_t    i;
143
144    if (U_FAILURE(*fStatus)) {
145        return NULL;
146    }
147
148    // Remove comments and whitespace from the rules to make it smaller.
149    UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules));
150
151    // Calculate the size of each section in the data.
152    //   Sizes here are padded up to a multiple of 8 for better memory alignment.
153    //   Sections sizes actually stored in the header are for the actual data
154    //     without the padding.
155    //
156    int32_t headerSize        = align8(sizeof(RBBIDataHeader));
157    int32_t forwardTableSize  = align8(fForwardTables->getTableSize());
158    int32_t reverseTableSize  = align8(fReverseTables->getTableSize());
159    int32_t safeFwdTableSize  = align8(fSafeFwdTables->getTableSize());
160    int32_t safeRevTableSize  = align8(fSafeRevTables->getTableSize());
161    int32_t trieSize          = align8(fSetBuilder->getTrieSize());
162    int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
163    int32_t rulesSize         = align8((strippedRules.length()+1) * sizeof(UChar));
164
165    int32_t         totalSize = headerSize + forwardTableSize + reverseTableSize
166                                + safeFwdTableSize + safeRevTableSize
167                                + statusTableSize + trieSize + rulesSize;
168
169    RBBIDataHeader  *data     = (RBBIDataHeader *)uprv_malloc(totalSize);
170    if (data == NULL) {
171        *fStatus = U_MEMORY_ALLOCATION_ERROR;
172        return NULL;
173    }
174    uprv_memset(data, 0, totalSize);
175
176
177    data->fMagic            = 0xb1a0;
178    data->fFormatVersion[0] = 3;
179    data->fFormatVersion[1] = 1;
180    data->fFormatVersion[2] = 0;
181    data->fFormatVersion[3] = 0;
182    data->fLength           = totalSize;
183    data->fCatCount         = fSetBuilder->getNumCharCategories();
184
185    data->fFTable        = headerSize;
186    data->fFTableLen     = forwardTableSize;
187    data->fRTable        = data->fFTable  + forwardTableSize;
188    data->fRTableLen     = reverseTableSize;
189    data->fSFTable       = data->fRTable  + reverseTableSize;
190    data->fSFTableLen    = safeFwdTableSize;
191    data->fSRTable       = data->fSFTable + safeFwdTableSize;
192    data->fSRTableLen    = safeRevTableSize;
193
194    data->fTrie          = data->fSRTable + safeRevTableSize;
195    data->fTrieLen       = fSetBuilder->getTrieSize();
196    data->fStatusTable   = data->fTrie    + trieSize;
197    data->fStatusTableLen= statusTableSize;
198    data->fRuleSource    = data->fStatusTable + statusTableSize;
199    data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
200
201    uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
202
203    fForwardTables->exportTable((uint8_t *)data + data->fFTable);
204    fReverseTables->exportTable((uint8_t *)data + data->fRTable);
205    fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
206    fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
207    fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
208
209    int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
210    for (i=0; i<fRuleStatusVals->size(); i++) {
211        ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
212    }
213
214    strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
215
216    return data;
217}
218
219
220
221
222
223
224//----------------------------------------------------------------------------------------
225//
226//  createRuleBasedBreakIterator    construct from source rules that are passed in
227//                                  in a UnicodeString
228//
229//----------------------------------------------------------------------------------------
230BreakIterator *
231RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
232                                    UParseError      *parseError,
233                                    UErrorCode       &status)
234{
235    // status checked below
236
237    //
238    // Read the input rules, generate a parse tree, symbol table,
239    // and list of all Unicode Sets referenced by the rules.
240    //
241    RBBIRuleBuilder  builder(rules, parseError, status);
242    if (U_FAILURE(status)) { // status checked here bcos build below doesn't
243        return NULL;
244    }
245    builder.fScanner->parse();
246
247    //
248    // UnicodeSet processing.
249    //    Munge the Unicode Sets to create a set of character categories.
250    //    Generate the mapping tables (TRIE) from input 32-bit characters to
251    //    the character categories.
252    //
253    builder.fSetBuilder->build();
254
255
256    //
257    //   Generate the DFA state transition table.
258    //
259    builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
260    builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
261    builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
262    builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
263    if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
264        builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
265    {
266        status = U_MEMORY_ALLOCATION_ERROR;
267        delete builder.fForwardTables; builder.fForwardTables = NULL;
268        delete builder.fReverseTables; builder.fReverseTables = NULL;
269        delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
270        delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
271        return NULL;
272    }
273
274    builder.fForwardTables->build();
275    builder.fReverseTables->build();
276    builder.fSafeFwdTables->build();
277    builder.fSafeRevTables->build();
278
279#ifdef RBBI_DEBUG
280    if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
281        builder.fForwardTables->printRuleStatusTable();
282    }
283#endif
284
285    //
286    //   Package up the compiled data into a memory image
287    //      in the run-time format.
288    //
289    RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
290    if (U_FAILURE(*builder.fStatus)) {
291        return NULL;
292    }
293
294
295    //
296    //  Clean up the compiler related stuff
297    //
298
299
300    //
301    //  Create a break iterator from the compiled rules.
302    //     (Identical to creation from stored pre-compiled rules)
303    //
304    // status is checked after init in construction.
305    RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
306    if (U_FAILURE(status)) {
307        delete This;
308        This = NULL;
309    }
310    else if(This == NULL) { // test for NULL
311        status = U_MEMORY_ALLOCATION_ERROR;
312    }
313    return This;
314}
315
316U_NAMESPACE_END
317
318#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
319