1/*
2***************************************************************************
3*   Copyright (C) 1999-2008 International Business Machines Corporation   *
4*   and others. All rights reserved.                                      *
5***************************************************************************
6*/
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION
11
12#include "unicode/utypes.h"
13#include "rbbidata.h"
14#include "rbbirb.h"
15#include "utrie.h"
16#include "udatamem.h"
17#include "cmemory.h"
18#include "cstring.h"
19#include "umutex.h"
20
21#include "uassert.h"
22
23
24//-----------------------------------------------------------------------------------
25//
26//   Trie access folding function.  Copied as-is from properties code in uchar.c
27//
28//-----------------------------------------------------------------------------------
29U_CDECL_BEGIN
30static int32_t U_CALLCONV
31getFoldingOffset(uint32_t data) {
32    /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
33    if(data&0x8000) {
34        return (int32_t)(data&0x7fff);
35    } else {
36        return 0;
37    }
38}
39U_CDECL_END
40
41U_NAMESPACE_BEGIN
42
43//-----------------------------------------------------------------------------
44//
45//    Constructors.
46//
47//-----------------------------------------------------------------------------
48RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
49    init(data, status);
50}
51
52RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) {
53    init(data, status);
54    fDontFreeData = TRUE;
55}
56
57RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
58    const RBBIDataHeader *d = (const RBBIDataHeader *)
59        // ((char *)&(udm->pHeader->info) + udm->pHeader->info.size);
60        // taking into consideration the padding added in by udata_write
61        ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize);
62    init(d, status);
63    fUDataMem = udm;
64}
65
66//-----------------------------------------------------------------------------
67//
68//    init().   Does most of the work of construction, shared between the
69//              constructors.
70//
71//-----------------------------------------------------------------------------
72void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
73    if (U_FAILURE(status)) {
74        return;
75    }
76    fHeader = data;
77    if (fHeader->fMagic != 0xb1a0 ||
78        !(fHeader->fFormatVersion[0] == 3 ||         // ICU 3.4
79          *(int32_t *)fHeader->fFormatVersion == 1))  // ICU 3.2 and earlier.
80    {
81        status = U_INVALID_FORMAT_ERROR;
82        return;
83    }
84
85    fDontFreeData = FALSE;
86    fUDataMem     = NULL;
87    fReverseTable = NULL;
88    fSafeFwdTable = NULL;
89    fSafeRevTable = NULL;
90    if (data->fFTableLen != 0) {
91        fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
92    }
93    if (data->fRTableLen != 0) {
94        fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
95    }
96    if (data->fSFTableLen != 0) {
97        fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
98    }
99    if (data->fSRTableLen != 0) {
100        fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
101    }
102
103
104    utrie_unserialize(&fTrie,
105                       (uint8_t *)data + fHeader->fTrie,
106                       fHeader->fTrieLen,
107                       &status);
108    if (U_FAILURE(status)) {
109        return;
110    }
111    fTrie.getFoldingOffset=getFoldingOffset;
112
113
114    fRuleSource   = (UChar *)((char *)data + fHeader->fRuleSource);
115    fRuleString.setTo(TRUE, fRuleSource, -1);
116    U_ASSERT(data->fRuleSourceLen > 0);
117
118    fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
119    fStatusMaxIdx    = data->fStatusTableLen / sizeof(int32_t);
120
121    fRefCount = 1;
122
123#ifdef RBBI_DEBUG
124    char *debugEnv = getenv("U_RBBIDEBUG");
125    if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
126#endif
127}
128
129
130//-----------------------------------------------------------------------------
131//
132//    Destructor.     Don't call this - use removeReference() instead.
133//
134//-----------------------------------------------------------------------------
135RBBIDataWrapper::~RBBIDataWrapper() {
136    U_ASSERT(fRefCount == 0);
137    if (fUDataMem) {
138        udata_close(fUDataMem);
139    } else if (!fDontFreeData) {
140        uprv_free((void *)fHeader);
141    }
142}
143
144
145
146//-----------------------------------------------------------------------------
147//
148//   Operator ==    Consider two RBBIDataWrappers to be equal if they
149//                  refer to the same underlying data.  Although
150//                  the data wrappers are normally shared between
151//                  iterator instances, it's possible to independently
152//                  open the same data twice, and get two instances, which
153//                  should still be ==.
154//
155//-----------------------------------------------------------------------------
156UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
157    if (fHeader == other.fHeader) {
158        return TRUE;
159    }
160    if (fHeader->fLength != other.fHeader->fLength) {
161        return FALSE;
162    }
163    if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) {
164        return TRUE;
165    }
166    return FALSE;
167}
168
169int32_t  RBBIDataWrapper::hashCode() {
170    return fHeader->fFTableLen;
171}
172
173
174
175//-----------------------------------------------------------------------------
176//
177//    Reference Counting.   A single RBBIDataWrapper object is shared among
178//                          however many RulesBasedBreakIterator instances are
179//                          referencing the same data.
180//
181//-----------------------------------------------------------------------------
182void RBBIDataWrapper::removeReference() {
183    if (umtx_atomic_dec(&fRefCount) == 0) {
184        delete this;
185    }
186}
187
188
189RBBIDataWrapper *RBBIDataWrapper::addReference() {
190   umtx_atomic_inc(&fRefCount);
191   return this;
192}
193
194
195
196//-----------------------------------------------------------------------------
197//
198//  getRuleSourceString
199//
200//-----------------------------------------------------------------------------
201const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
202    return fRuleString;
203}
204
205
206//-----------------------------------------------------------------------------
207//
208//  print   -  debugging function to dump the runtime data tables.
209//
210//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void  RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
    //  Dump one state table.
    //     heading  - title line printed above the table.
    //     table    - the table to dump.  May be NULL, in which case only the
    //                column headings and a "null table" marker are printed.
    const uint32_t  catCount = fHeader->fCatCount;
    uint32_t        col;
    uint32_t        state;

    RBBIDebugPrintf("   %s\n", heading);

    //  Column headings: the fixed row fields, then one column per category.
    RBBIDebugPrintf("State |  Acc  LA TagIx");
    col = 0;
    while (col < catCount) {
        RBBIDebugPrintf("%3d ", col);
        col++;
    }

    //  Separator rule underneath the headings.
    RBBIDebugPrintf("\n------|---------------");
    col = 0;
    while (col < catCount) {
        RBBIDebugPrintf("----");
        col++;
    }
    RBBIDebugPrintf("\n");

    if (table == NULL) {
        RBBIDebugPrintf("         N U L L   T A B L E\n\n");
        return;
    }

    //  One output line per state.  Rows are fRowLen bytes apart, starting at
    //  fTableData.
    state = 0;
    while (state < table->fNumStates) {
        RBBIStateTableRow *row = (RBBIStateTableRow *)
                                  (table->fTableData + (table->fRowLen * state));
        RBBIDebugPrintf("%4d  |  %3d %3d %3d ", state, row->fAccepting, row->fLookAhead, row->fTagIdx);
        col = 0;
        while (col < catCount) {
            RBBIDebugPrintf("%3d ", row->fNextState[col]);
            col++;
        }
        RBBIDebugPrintf("\n");
        state++;
    }
    RBBIDebugPrintf("\n");
}
#endif
241
242
#ifdef RBBI_DEBUG
void  RBBIDataWrapper::printData() {
    //  Dump the header summary, all four state tables, and the rule source.
    const RBBIDataHeader *hdr = fHeader;

    RBBIDebugPrintf("RBBI Data at %p\n", (void *)hdr);
    RBBIDebugPrintf("   Version = {%d %d %d %d}\n", hdr->fFormatVersion[0], hdr->fFormatVersion[1],
                                                    hdr->fFormatVersion[2], hdr->fFormatVersion[3]);
    RBBIDebugPrintf("   total length of data  = %d\n", hdr->fLength);
    RBBIDebugPrintf("   number of character categories = %d\n\n", hdr->fCatCount);

    printTable("Forward State Transition Table", fForwardTable);
    printTable("Reverse State Transition Table", fReverseTable);
    printTable("Safe Forward State Transition Table", fSafeFwdTable);
    printTable("Safe Reverse State Transition Table", fSafeRevTable);

    //  The rule text is printed one code unit at a time, up to the
    //  terminating zero.
    RBBIDebugPrintf("\nOrignal Rules source:\n");
    int32_t idx = 0;
    while (fRuleSource[idx] != 0) {
        RBBIDebugPrintf("%c", fRuleSource[idx]);
        idx++;
    }
    RBBIDebugPrintf("\n\n");
}
#endif
263
264
265U_NAMESPACE_END
266U_NAMESPACE_USE
267
268//-----------------------------------------------------------------------------
269//
270//  ubrk_swap   -  byte swap and char encoding swap of RBBI data
271//
272//-----------------------------------------------------------------------------
273
274U_CAPI int32_t U_EXPORT2
275ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
276           UErrorCode *status) {
277
278    if (status == NULL || U_FAILURE(*status)) {
279        return 0;
280    }
281    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
282        *status=U_ILLEGAL_ARGUMENT_ERROR;
283        return 0;
284    }
285
286    //
287    //  Check that the data header is for for break data.
288    //    (Header contents are defined in genbrk.cpp)
289    //
290    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
291    if(!(  pInfo->dataFormat[0]==0x42 &&   /* dataFormat="Brk " */
292           pInfo->dataFormat[1]==0x72 &&
293           pInfo->dataFormat[2]==0x6b &&
294           pInfo->dataFormat[3]==0x20 &&
295           pInfo->formatVersion[0]==3  )) {
296        udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
297                         pInfo->dataFormat[0], pInfo->dataFormat[1],
298                         pInfo->dataFormat[2], pInfo->dataFormat[3],
299                         pInfo->formatVersion[0]);
300        *status=U_UNSUPPORTED_ERROR;
301        return 0;
302    }
303
304    //
305    // Swap the data header.  (This is the generic ICU Data Header, not the RBBI Specific
306    //                         RBBIDataHeader).  This swap also conveniently gets us
307    //                         the size of the ICU d.h., which lets us locate the start
308    //                         of the RBBI specific data.
309    //
310    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
311
312
313    //
314    // Get the RRBI Data Header, and check that it appears to be OK.
315    //
316    //    Note:  ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually
317    //           an int32_t with a value of 1.  Starting with ICU 3.4,
318    //           RBBI's fDataFormat matches the dataFormat field from the
319    //           UDataInfo header, four int8_t bytes.  The value is {3,1,0,0}
320    //
321    const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
322    RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
323    UBool           formatVersionOne = ds->readUInt32(*(int32_t *)rbbiDH->fFormatVersion) == 1;
324    if (ds->readUInt32(rbbiDH->fMagic)   != 0xb1a0 ||
325        !(formatVersionOne || rbbiDH->fFormatVersion[0] == 3)   ||
326        ds->readUInt32(rbbiDH->fLength)  <  sizeof(RBBIDataHeader))
327    {
328        udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
329        *status=U_UNSUPPORTED_ERROR;
330        return 0;
331    }
332
333    //
334    // Prefight operation?  Just return the size
335    //
336    int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
337    int32_t totalSize = headerSize + breakDataLength;
338    if (length < 0) {
339        return totalSize;
340    }
341
342    //
343    // Check that length passed in is consistent with length from RBBI data header.
344    //
345    if (length < totalSize) {
346        udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
347                            breakDataLength);
348        *status=U_INDEX_OUTOFBOUNDS_ERROR;
349        return 0;
350        }
351
352
353    //
354    // Swap the Data.  Do the data itself first, then the RBBI Data Header, because
355    //                 we need to reference the header to locate the data, and an
356    //                 inplace swap of the header leaves it unusable.
357    //
358    uint8_t         *outBytes = (uint8_t *)outData + headerSize;
359    RBBIDataHeader  *outputDH = (RBBIDataHeader *)outBytes;
360
361    int32_t   tableStartOffset;
362    int32_t   tableLength;
363
364    //
365    // If not swapping in place, zero out the output buffer before starting.
366    //    Individual tables and other data items within are aligned to 8 byte boundaries
367    //    when originally created.  Any unused space between items needs to be zero.
368    //
369    if (inBytes != outBytes) {
370        uprv_memset(outBytes, 0, breakDataLength);
371    }
372
373    //
374    // Each state table begins with several 32 bit fields.  Calculate the size
375    //   in bytes of these.
376    //
377    int32_t         topSize = offsetof(RBBIStateTable, fTableData);
378
379    // Forward state table.
380    tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
381    tableLength      = ds->readUInt32(rbbiDH->fFTableLen);
382
383    if (tableLength > 0) {
384        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
385                            outBytes+tableStartOffset, status);
386        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
387                            outBytes+tableStartOffset+topSize, status);
388    }
389
390    // Reverse state table.  Same layout as forward table, above.
391    tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
392    tableLength      = ds->readUInt32(rbbiDH->fRTableLen);
393
394    if (tableLength > 0) {
395        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
396                            outBytes+tableStartOffset, status);
397        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
398                            outBytes+tableStartOffset+topSize, status);
399    }
400
401    // Safe Forward state table.  Same layout as forward table, above.
402    tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
403    tableLength      = ds->readUInt32(rbbiDH->fSFTableLen);
404
405    if (tableLength > 0) {
406        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
407                            outBytes+tableStartOffset, status);
408        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
409                            outBytes+tableStartOffset+topSize, status);
410    }
411
412    // Safe Reverse state table.  Same layout as forward table, above.
413    tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
414    tableLength      = ds->readUInt32(rbbiDH->fSRTableLen);
415
416    if (tableLength > 0) {
417        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
418                            outBytes+tableStartOffset, status);
419        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
420                            outBytes+tableStartOffset+topSize, status);
421    }
422
423    // Trie table for character categories
424    utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
425                            outBytes+ds->readUInt32(rbbiDH->fTrie), status);
426
427    // Source Rules Text.  It's UChar data
428    ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
429                        outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
430
431    // Table of rule status values.  It's all int_32 values
432    ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
433                        outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
434
435    // And, last, the header.
436    //   For the old version one format, the entire header consists of int32_t values.
437    //   For the newer formats, the fDataFormat field is an array of four bytes.
438    //   Swap the whole thing as int32_t, then, for the newer format, re-swap the one field.
439    //
440    ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
441    if (formatVersionOne == FALSE) {
442        ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status);
443    }
444
445
446    return totalSize;
447}
448
449
450#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
451