1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4***************************************************************************
5*   Copyright (C) 1999-2014 International Business Machines Corporation   *
6*   and others. All rights reserved.                                      *
7***************************************************************************
8*/
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_BREAK_ITERATION
13
14#include "unicode/utypes.h"
15#include "rbbidata.h"
16#include "rbbirb.h"
17#include "utrie.h"
18#include "udatamem.h"
19#include "cmemory.h"
20#include "cstring.h"
21#include "umutex.h"
22
23#include "uassert.h"
24
25
26//-----------------------------------------------------------------------------------
27//
28//   Trie access folding function.  Copied as-is from properties code in uchar.c
29//
30//-----------------------------------------------------------------------------------
31U_CDECL_BEGIN
/*
 * Trie folding callback: extracts the folding offset from a 16-bit trie result.
 * When bit 15 of the value is set, the offset is carried in bits 14..0;
 * otherwise there is no folding offset and 0 is returned.
 */
static int32_t U_CALLCONV
getFoldingOffset(uint32_t data) {
    return (data & 0x8000) != 0 ? (int32_t)(data & 0x7fff) : 0;
}
41U_CDECL_END
42
43U_NAMESPACE_BEGIN
44
45//-----------------------------------------------------------------------------
46//
47//    Constructors.
48//
49//-----------------------------------------------------------------------------
50RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
51    init0();
52    init(data, status);
53}
54
55RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) {
56    init0();
57    init(data, status);
58    fDontFreeData = TRUE;
59}
60
61RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
62    init0();
63    if (U_FAILURE(status)) {
64        return;
65    }
66    const DataHeader *dh = udm->pHeader;
67    int32_t headerSize = dh->dataHeader.headerSize;
68    if (  !(headerSize >= 20 &&
69            dh->info.isBigEndian == U_IS_BIG_ENDIAN &&
70            dh->info.charsetFamily == U_CHARSET_FAMILY &&
71            dh->info.dataFormat[0] == 0x42 &&  // dataFormat="Brk "
72            dh->info.dataFormat[1] == 0x72 &&
73            dh->info.dataFormat[2] == 0x6b &&
74            dh->info.dataFormat[3] == 0x20)
75            // Note: info.fFormatVersion is duplicated in the RBBIDataHeader, and is
76            //       validated when checking that.
77        ) {
78        status = U_INVALID_FORMAT_ERROR;
79        return;
80    }
81    const char *dataAsBytes = reinterpret_cast<const char *>(dh);
82    const RBBIDataHeader *rbbidh = reinterpret_cast<const RBBIDataHeader *>(dataAsBytes + headerSize);
83    init(rbbidh, status);
84    fUDataMem = udm;
85}
86
87//-----------------------------------------------------------------------------
88//
89//    init().   Does most of the work of construction, shared between the
90//              constructors.
91//
92//-----------------------------------------------------------------------------
93void RBBIDataWrapper::init0() {
94    fHeader = NULL;
95    fForwardTable = NULL;
96    fReverseTable = NULL;
97    fSafeFwdTable = NULL;
98    fSafeRevTable = NULL;
99    fRuleSource = NULL;
100    fRuleStatusTable = NULL;
101    fUDataMem = NULL;
102    fRefCount = 0;
103    fDontFreeData = TRUE;
104}
105
106void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
107    if (U_FAILURE(status)) {
108        return;
109    }
110    fHeader = data;
111    if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3)
112    {
113        status = U_INVALID_FORMAT_ERROR;
114        return;
115    }
116    // Note: in ICU version 3.2 and earlier, there was a formatVersion 1
117    //       that is no longer supported.  At that time fFormatVersion was
118    //       an int32_t field, rather than an array of 4 bytes.
119
120    fDontFreeData = FALSE;
121    if (data->fFTableLen != 0) {
122        fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
123    }
124    if (data->fRTableLen != 0) {
125        fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
126    }
127    if (data->fSFTableLen != 0) {
128        fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
129    }
130    if (data->fSRTableLen != 0) {
131        fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
132    }
133
134
135    utrie_unserialize(&fTrie,
136                       (uint8_t *)data + fHeader->fTrie,
137                       fHeader->fTrieLen,
138                       &status);
139    if (U_FAILURE(status)) {
140        return;
141    }
142    fTrie.getFoldingOffset=getFoldingOffset;
143
144
145    fRuleSource   = (UChar *)((char *)data + fHeader->fRuleSource);
146    fRuleString.setTo(TRUE, fRuleSource, -1);
147    U_ASSERT(data->fRuleSourceLen > 0);
148
149    fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
150    fStatusMaxIdx    = data->fStatusTableLen / sizeof(int32_t);
151
152    fRefCount = 1;
153
154#ifdef RBBI_DEBUG
155    char *debugEnv = getenv("U_RBBIDEBUG");
156    if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
157#endif
158}
159
160
161//-----------------------------------------------------------------------------
162//
163//    Destructor.     Don't call this - use removeReference() instead.
164//
165//-----------------------------------------------------------------------------
166RBBIDataWrapper::~RBBIDataWrapper() {
167    U_ASSERT(fRefCount == 0);
168    if (fUDataMem) {
169        udata_close(fUDataMem);
170    } else if (!fDontFreeData) {
171        uprv_free((void *)fHeader);
172    }
173}
174
175
176
177//-----------------------------------------------------------------------------
178//
179//   Operator ==    Consider two RBBIDataWrappers to be equal if they
180//                  refer to the same underlying data.  Although
181//                  the data wrappers are normally shared between
182//                  iterator instances, it's possible to independently
183//                  open the same data twice, and get two instances, which
184//                  should still be ==.
185//
186//-----------------------------------------------------------------------------
187UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
188    if (fHeader == other.fHeader) {
189        return TRUE;
190    }
191    if (fHeader->fLength != other.fHeader->fLength) {
192        return FALSE;
193    }
194    if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) {
195        return TRUE;
196    }
197    return FALSE;
198}
199
200int32_t  RBBIDataWrapper::hashCode() {
201    return fHeader->fFTableLen;
202}
203
204
205
206//-----------------------------------------------------------------------------
207//
208//    Reference Counting.   A single RBBIDataWrapper object is shared among
209//                          however many RulesBasedBreakIterator instances are
210//                          referencing the same data.
211//
212//-----------------------------------------------------------------------------
213void RBBIDataWrapper::removeReference() {
214    if (umtx_atomic_dec(&fRefCount) == 0) {
215        delete this;
216    }
217}
218
219
220RBBIDataWrapper *RBBIDataWrapper::addReference() {
221   umtx_atomic_inc(&fRefCount);
222   return this;
223}
224
225
226
227//-----------------------------------------------------------------------------
228//
229//  getRuleSourceString
230//
231//-----------------------------------------------------------------------------
232const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
233    return fRuleString;
234}
235
236
237//-----------------------------------------------------------------------------
238//
239//  print   -  debugging function to dump the runtime data tables.
240//
241//-----------------------------------------------------------------------------
242#ifdef RBBI_DEBUG
243void  RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
244    uint32_t   c;
245    uint32_t   s;
246
247    RBBIDebugPrintf("   %s\n", heading);
248
249    RBBIDebugPrintf("State |  Acc  LA TagIx");
250    for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
251    RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
252        RBBIDebugPrintf("----");
253    }
254    RBBIDebugPrintf("\n");
255
256    if (table == NULL) {
257        RBBIDebugPrintf("         N U L L   T A B L E\n\n");
258        return;
259    }
260    for (s=0; s<table->fNumStates; s++) {
261        RBBIStateTableRow *row = (RBBIStateTableRow *)
262                                  (table->fTableData + (table->fRowLen * s));
263        RBBIDebugPrintf("%4d  |  %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
264        for (c=0; c<fHeader->fCatCount; c++)  {
265            RBBIDebugPrintf("%3d ", row->fNextState[c]);
266        }
267        RBBIDebugPrintf("\n");
268    }
269    RBBIDebugPrintf("\n");
270}
271#endif
272
273
274#ifdef RBBI_DEBUG
275void  RBBIDataWrapper::printData() {
276    RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
277    RBBIDebugPrintf("   Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
278                                                    fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
279    RBBIDebugPrintf("   total length of data  = %d\n", fHeader->fLength);
280    RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);
281
282    printTable("Forward State Transition Table", fForwardTable);
283    printTable("Reverse State Transition Table", fReverseTable);
284    printTable("Safe Forward State Transition Table", fSafeFwdTable);
285    printTable("Safe Reverse State Transition Table", fSafeRevTable);
286
287    RBBIDebugPrintf("\nOrignal Rules source:\n");
288    for (int32_t c=0; fRuleSource[c] != 0; c++) {
289        RBBIDebugPrintf("%c", fRuleSource[c]);
290    }
291    RBBIDebugPrintf("\n\n");
292}
293#endif
294
295
296U_NAMESPACE_END
297U_NAMESPACE_USE
298
299//-----------------------------------------------------------------------------
300//
301//  ubrk_swap   -  byte swap and char encoding swap of RBBI data
302//
303//-----------------------------------------------------------------------------
304
305U_CAPI int32_t U_EXPORT2
306ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
307           UErrorCode *status) {
308
309    if (status == NULL || U_FAILURE(*status)) {
310        return 0;
311    }
312    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
313        *status=U_ILLEGAL_ARGUMENT_ERROR;
314        return 0;
315    }
316
317    //
318    //  Check that the data header is for for break data.
319    //    (Header contents are defined in genbrk.cpp)
320    //
321    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
322    if(!(  pInfo->dataFormat[0]==0x42 &&   /* dataFormat="Brk " */
323           pInfo->dataFormat[1]==0x72 &&
324           pInfo->dataFormat[2]==0x6b &&
325           pInfo->dataFormat[3]==0x20 &&
326           pInfo->formatVersion[0]==3  )) {
327        udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
328                         pInfo->dataFormat[0], pInfo->dataFormat[1],
329                         pInfo->dataFormat[2], pInfo->dataFormat[3],
330                         pInfo->formatVersion[0]);
331        *status=U_UNSUPPORTED_ERROR;
332        return 0;
333    }
334
335    //
336    // Swap the data header.  (This is the generic ICU Data Header, not the RBBI Specific
337    //                         RBBIDataHeader).  This swap also conveniently gets us
338    //                         the size of the ICU d.h., which lets us locate the start
339    //                         of the RBBI specific data.
340    //
341    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
342
343
344    //
345    // Get the RRBI Data Header, and check that it appears to be OK.
346    //
347    //    Note:  ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually
348    //           an int32_t with a value of 1.  Starting with ICU 3.4,
349    //           RBBI's fDataFormat matches the dataFormat field from the
350    //           UDataInfo header, four int8_t bytes.  The value is {3,1,0,0}
351    //
352    const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
353    RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
354    if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 ||
355        rbbiDH->fFormatVersion[0] != 3 ||
356        ds->readUInt32(rbbiDH->fLength)  <  sizeof(RBBIDataHeader))
357    {
358        udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
359        *status=U_UNSUPPORTED_ERROR;
360        return 0;
361    }
362
363    //
364    // Prefight operation?  Just return the size
365    //
366    int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
367    int32_t totalSize = headerSize + breakDataLength;
368    if (length < 0) {
369        return totalSize;
370    }
371
372    //
373    // Check that length passed in is consistent with length from RBBI data header.
374    //
375    if (length < totalSize) {
376        udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
377                            breakDataLength);
378        *status=U_INDEX_OUTOFBOUNDS_ERROR;
379        return 0;
380        }
381
382
383    //
384    // Swap the Data.  Do the data itself first, then the RBBI Data Header, because
385    //                 we need to reference the header to locate the data, and an
386    //                 inplace swap of the header leaves it unusable.
387    //
388    uint8_t         *outBytes = (uint8_t *)outData + headerSize;
389    RBBIDataHeader  *outputDH = (RBBIDataHeader *)outBytes;
390
391    int32_t   tableStartOffset;
392    int32_t   tableLength;
393
394    //
395    // If not swapping in place, zero out the output buffer before starting.
396    //    Individual tables and other data items within are aligned to 8 byte boundaries
397    //    when originally created.  Any unused space between items needs to be zero.
398    //
399    if (inBytes != outBytes) {
400        uprv_memset(outBytes, 0, breakDataLength);
401    }
402
403    //
404    // Each state table begins with several 32 bit fields.  Calculate the size
405    //   in bytes of these.
406    //
407    int32_t         topSize = offsetof(RBBIStateTable, fTableData);
408
409    // Forward state table.
410    tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
411    tableLength      = ds->readUInt32(rbbiDH->fFTableLen);
412
413    if (tableLength > 0) {
414        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
415                            outBytes+tableStartOffset, status);
416        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
417                            outBytes+tableStartOffset+topSize, status);
418    }
419
420    // Reverse state table.  Same layout as forward table, above.
421    tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
422    tableLength      = ds->readUInt32(rbbiDH->fRTableLen);
423
424    if (tableLength > 0) {
425        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
426                            outBytes+tableStartOffset, status);
427        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
428                            outBytes+tableStartOffset+topSize, status);
429    }
430
431    // Safe Forward state table.  Same layout as forward table, above.
432    tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
433    tableLength      = ds->readUInt32(rbbiDH->fSFTableLen);
434
435    if (tableLength > 0) {
436        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
437                            outBytes+tableStartOffset, status);
438        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
439                            outBytes+tableStartOffset+topSize, status);
440    }
441
442    // Safe Reverse state table.  Same layout as forward table, above.
443    tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
444    tableLength      = ds->readUInt32(rbbiDH->fSRTableLen);
445
446    if (tableLength > 0) {
447        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
448                            outBytes+tableStartOffset, status);
449        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
450                            outBytes+tableStartOffset+topSize, status);
451    }
452
453    // Trie table for character categories
454    utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
455                            outBytes+ds->readUInt32(rbbiDH->fTrie), status);
456
457    // Source Rules Text.  It's UChar data
458    ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
459                        outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
460
461    // Table of rule status values.  It's all int_32 values
462    ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
463                        outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
464
465    // And, last, the header.
466    //   It is all int32_t values except for fFormataVersion, which is an array of four bytes.
467    //   Swap the whole thing as int32_t, then re-swap the one field.
468    //
469    ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
470    ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status);
471
472    return totalSize;
473}
474
475
476#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
477