1/*
2***************************************************************************
3*   Copyright (C) 1999-2010 International Business Machines Corporation   *
4*   and others. All rights reserved.                                      *
5***************************************************************************
6*/
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION
11
12#include "unicode/utypes.h"
13#include "rbbidata.h"
14#include "rbbirb.h"
15#include "utrie.h"
16#include "udatamem.h"
17#include "cmemory.h"
18#include "cstring.h"
19#include "umutex.h"
20
21#include "uassert.h"
22
23
24//-----------------------------------------------------------------------------------
25//
26//   Trie access folding function.  Copied as-is from properties code in uchar.c
27//
28//-----------------------------------------------------------------------------------
29U_CDECL_BEGIN
30static int32_t U_CALLCONV
31getFoldingOffset(uint32_t data) {
32    /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
33    if(data&0x8000) {
34        return (int32_t)(data&0x7fff);
35    } else {
36        return 0;
37    }
38}
39U_CDECL_END
40
41U_NAMESPACE_BEGIN
42
43//-----------------------------------------------------------------------------
44//
45//    Constructors.
46//
47//-----------------------------------------------------------------------------
48RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) {
49    init(data, status);
50}
51
52RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) {
53    init(data, status);
54    fDontFreeData = TRUE;
55}
56
57RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
58    const RBBIDataHeader *d = (const RBBIDataHeader *)
59        // ((char *)&(udm->pHeader->info) + udm->pHeader->info.size);
60        // taking into consideration the padding added in by udata_write
61        ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize);
62    init(d, status);
63    fUDataMem = udm;
64}
65
66//-----------------------------------------------------------------------------
67//
68//    init().   Does most of the work of construction, shared between the
69//              constructors.
70//
71//-----------------------------------------------------------------------------
72void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
73    if (U_FAILURE(status)) {
74        return;
75    }
76    fHeader = data;
77    if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3)
78    {
79        status = U_INVALID_FORMAT_ERROR;
80        return;
81    }
82    // Note: in ICU version 3.2 and earlier, there was a formatVersion 1
83    //       that is no longer supported.  At that time fFormatVersion was
84    //       an int32_t field, rather than an array of 4 bytes.
85
86    fDontFreeData = FALSE;
87    fUDataMem     = NULL;
88    fReverseTable = NULL;
89    fSafeFwdTable = NULL;
90    fSafeRevTable = NULL;
91    if (data->fFTableLen != 0) {
92        fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
93    }
94    if (data->fRTableLen != 0) {
95        fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
96    }
97    if (data->fSFTableLen != 0) {
98        fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
99    }
100    if (data->fSRTableLen != 0) {
101        fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
102    }
103
104
105    utrie_unserialize(&fTrie,
106                       (uint8_t *)data + fHeader->fTrie,
107                       fHeader->fTrieLen,
108                       &status);
109    if (U_FAILURE(status)) {
110        return;
111    }
112    fTrie.getFoldingOffset=getFoldingOffset;
113
114
115    fRuleSource   = (UChar *)((char *)data + fHeader->fRuleSource);
116    fRuleString.setTo(TRUE, fRuleSource, -1);
117    U_ASSERT(data->fRuleSourceLen > 0);
118
119    fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
120    fStatusMaxIdx    = data->fStatusTableLen / sizeof(int32_t);
121
122    fRefCount = 1;
123
124#ifdef RBBI_DEBUG
125    char *debugEnv = getenv("U_RBBIDEBUG");
126    if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
127#endif
128}
129
130
131//-----------------------------------------------------------------------------
132//
133//    Destructor.     Don't call this - use removeReference() instead.
134//
135//-----------------------------------------------------------------------------
136RBBIDataWrapper::~RBBIDataWrapper() {
137    U_ASSERT(fRefCount == 0);
138    if (fUDataMem) {
139        udata_close(fUDataMem);
140    } else if (!fDontFreeData) {
141        uprv_free((void *)fHeader);
142    }
143}
144
145
146
147//-----------------------------------------------------------------------------
148//
149//   Operator ==    Consider two RBBIDataWrappers to be equal if they
150//                  refer to the same underlying data.  Although
151//                  the data wrappers are normally shared between
152//                  iterator instances, it's possible to independently
153//                  open the same data twice, and get two instances, which
154//                  should still be ==.
155//
156//-----------------------------------------------------------------------------
157UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const {
158    if (fHeader == other.fHeader) {
159        return TRUE;
160    }
161    if (fHeader->fLength != other.fHeader->fLength) {
162        return FALSE;
163    }
164    if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) {
165        return TRUE;
166    }
167    return FALSE;
168}
169
170int32_t  RBBIDataWrapper::hashCode() {
171    return fHeader->fFTableLen;
172}
173
174
175
176//-----------------------------------------------------------------------------
177//
178//    Reference Counting.   A single RBBIDataWrapper object is shared among
179//                          however many RulesBasedBreakIterator instances are
180//                          referencing the same data.
181//
182//-----------------------------------------------------------------------------
183void RBBIDataWrapper::removeReference() {
184    if (umtx_atomic_dec(&fRefCount) == 0) {
185        delete this;
186    }
187}
188
189
190RBBIDataWrapper *RBBIDataWrapper::addReference() {
191   umtx_atomic_inc(&fRefCount);
192   return this;
193}
194
195
196
197//-----------------------------------------------------------------------------
198//
199//  getRuleSourceString
200//
201//-----------------------------------------------------------------------------
202const UnicodeString &RBBIDataWrapper::getRuleSourceString() const {
203    return fRuleString;
204}
205
206
207//-----------------------------------------------------------------------------
208//
209//  print   -  debugging function to dump the runtime data tables.
210//
211//-----------------------------------------------------------------------------
212#ifdef RBBI_DEBUG
213void  RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) {
214    uint32_t   c;
215    uint32_t   s;
216
217    RBBIDebugPrintf("   %s\n", heading);
218
219    RBBIDebugPrintf("State |  Acc  LA TagIx");
220    for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
221    RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
222        RBBIDebugPrintf("----");
223    }
224    RBBIDebugPrintf("\n");
225
226    if (table == NULL) {
227        RBBIDebugPrintf("         N U L L   T A B L E\n\n");
228        return;
229    }
230    for (s=0; s<table->fNumStates; s++) {
231        RBBIStateTableRow *row = (RBBIStateTableRow *)
232                                  (table->fTableData + (table->fRowLen * s));
233        RBBIDebugPrintf("%4d  |  %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
234        for (c=0; c<fHeader->fCatCount; c++)  {
235            RBBIDebugPrintf("%3d ", row->fNextState[c]);
236        }
237        RBBIDebugPrintf("\n");
238    }
239    RBBIDebugPrintf("\n");
240}
241#endif
242
243
244#ifdef RBBI_DEBUG
245void  RBBIDataWrapper::printData() {
246    RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
247    RBBIDebugPrintf("   Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
248                                                    fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
249    RBBIDebugPrintf("   total length of data  = %d\n", fHeader->fLength);
250    RBBIDebugPrintf("   number of character categories = %d\n\n", fHeader->fCatCount);
251
252    printTable("Forward State Transition Table", fForwardTable);
253    printTable("Reverse State Transition Table", fReverseTable);
254    printTable("Safe Forward State Transition Table", fSafeFwdTable);
255    printTable("Safe Reverse State Transition Table", fSafeRevTable);
256
257    RBBIDebugPrintf("\nOrignal Rules source:\n");
258    for (int32_t c=0; fRuleSource[c] != 0; c++) {
259        RBBIDebugPrintf("%c", fRuleSource[c]);
260    }
261    RBBIDebugPrintf("\n\n");
262}
263#endif
264
265
266U_NAMESPACE_END
267U_NAMESPACE_USE
268
269//-----------------------------------------------------------------------------
270//
271//  ubrk_swap   -  byte swap and char encoding swap of RBBI data
272//
273//-----------------------------------------------------------------------------
274
275U_CAPI int32_t U_EXPORT2
276ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
277           UErrorCode *status) {
278
279    if (status == NULL || U_FAILURE(*status)) {
280        return 0;
281    }
282    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
283        *status=U_ILLEGAL_ARGUMENT_ERROR;
284        return 0;
285    }
286
287    //
288    //  Check that the data header is for for break data.
289    //    (Header contents are defined in genbrk.cpp)
290    //
291    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
292    if(!(  pInfo->dataFormat[0]==0x42 &&   /* dataFormat="Brk " */
293           pInfo->dataFormat[1]==0x72 &&
294           pInfo->dataFormat[2]==0x6b &&
295           pInfo->dataFormat[3]==0x20 &&
296           pInfo->formatVersion[0]==3  )) {
297        udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
298                         pInfo->dataFormat[0], pInfo->dataFormat[1],
299                         pInfo->dataFormat[2], pInfo->dataFormat[3],
300                         pInfo->formatVersion[0]);
301        *status=U_UNSUPPORTED_ERROR;
302        return 0;
303    }
304
305    //
306    // Swap the data header.  (This is the generic ICU Data Header, not the RBBI Specific
307    //                         RBBIDataHeader).  This swap also conveniently gets us
308    //                         the size of the ICU d.h., which lets us locate the start
309    //                         of the RBBI specific data.
310    //
311    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
312
313
314    //
315    // Get the RRBI Data Header, and check that it appears to be OK.
316    //
317    //    Note:  ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually
318    //           an int32_t with a value of 1.  Starting with ICU 3.4,
319    //           RBBI's fDataFormat matches the dataFormat field from the
320    //           UDataInfo header, four int8_t bytes.  The value is {3,1,0,0}
321    //
322    const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
323    RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
324    if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 ||
325        rbbiDH->fFormatVersion[0] != 3 ||
326        ds->readUInt32(rbbiDH->fLength)  <  sizeof(RBBIDataHeader))
327    {
328        udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
329        *status=U_UNSUPPORTED_ERROR;
330        return 0;
331    }
332
333    //
334    // Prefight operation?  Just return the size
335    //
336    int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength);
337    int32_t totalSize = headerSize + breakDataLength;
338    if (length < 0) {
339        return totalSize;
340    }
341
342    //
343    // Check that length passed in is consistent with length from RBBI data header.
344    //
345    if (length < totalSize) {
346        udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n",
347                            breakDataLength);
348        *status=U_INDEX_OUTOFBOUNDS_ERROR;
349        return 0;
350        }
351
352
353    //
354    // Swap the Data.  Do the data itself first, then the RBBI Data Header, because
355    //                 we need to reference the header to locate the data, and an
356    //                 inplace swap of the header leaves it unusable.
357    //
358    uint8_t         *outBytes = (uint8_t *)outData + headerSize;
359    RBBIDataHeader  *outputDH = (RBBIDataHeader *)outBytes;
360
361    int32_t   tableStartOffset;
362    int32_t   tableLength;
363
364    //
365    // If not swapping in place, zero out the output buffer before starting.
366    //    Individual tables and other data items within are aligned to 8 byte boundaries
367    //    when originally created.  Any unused space between items needs to be zero.
368    //
369    if (inBytes != outBytes) {
370        uprv_memset(outBytes, 0, breakDataLength);
371    }
372
373    //
374    // Each state table begins with several 32 bit fields.  Calculate the size
375    //   in bytes of these.
376    //
377    int32_t         topSize = offsetof(RBBIStateTable, fTableData);
378
379    // Forward state table.
380    tableStartOffset = ds->readUInt32(rbbiDH->fFTable);
381    tableLength      = ds->readUInt32(rbbiDH->fFTableLen);
382
383    if (tableLength > 0) {
384        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
385                            outBytes+tableStartOffset, status);
386        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
387                            outBytes+tableStartOffset+topSize, status);
388    }
389
390    // Reverse state table.  Same layout as forward table, above.
391    tableStartOffset = ds->readUInt32(rbbiDH->fRTable);
392    tableLength      = ds->readUInt32(rbbiDH->fRTableLen);
393
394    if (tableLength > 0) {
395        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
396                            outBytes+tableStartOffset, status);
397        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
398                            outBytes+tableStartOffset+topSize, status);
399    }
400
401    // Safe Forward state table.  Same layout as forward table, above.
402    tableStartOffset = ds->readUInt32(rbbiDH->fSFTable);
403    tableLength      = ds->readUInt32(rbbiDH->fSFTableLen);
404
405    if (tableLength > 0) {
406        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
407                            outBytes+tableStartOffset, status);
408        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
409                            outBytes+tableStartOffset+topSize, status);
410    }
411
412    // Safe Reverse state table.  Same layout as forward table, above.
413    tableStartOffset = ds->readUInt32(rbbiDH->fSRTable);
414    tableLength      = ds->readUInt32(rbbiDH->fSRTableLen);
415
416    if (tableLength > 0) {
417        ds->swapArray32(ds, inBytes+tableStartOffset, topSize,
418                            outBytes+tableStartOffset, status);
419        ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize,
420                            outBytes+tableStartOffset+topSize, status);
421    }
422
423    // Trie table for character categories
424    utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
425                            outBytes+ds->readUInt32(rbbiDH->fTrie), status);
426
427    // Source Rules Text.  It's UChar data
428    ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
429                        outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
430
431    // Table of rule status values.  It's all int_32 values
432    ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
433                        outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
434
435    // And, last, the header.
436    //   It is all int32_t values except for fFormataVersion, which is an array of four bytes.
437    //   Swap the whole thing as int32_t, then re-swap the one field.
438    //
439    ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);
440    ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status);
441
442    return totalSize;
443}
444
445
446#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
447