1/* 2*************************************************************************** 3* Copyright (C) 1999-2008 International Business Machines Corporation * 4* and others. All rights reserved. * 5*************************************************************************** 6*/ 7 8#include "unicode/utypes.h" 9 10#if !UCONFIG_NO_BREAK_ITERATION 11 12#include "unicode/utypes.h" 13#include "rbbidata.h" 14#include "rbbirb.h" 15#include "utrie.h" 16#include "udatamem.h" 17#include "cmemory.h" 18#include "cstring.h" 19#include "umutex.h" 20 21#include "uassert.h" 22 23 24//----------------------------------------------------------------------------------- 25// 26// Trie access folding function. Copied as-is from properties code in uchar.c 27// 28//----------------------------------------------------------------------------------- 29U_CDECL_BEGIN 30static int32_t U_CALLCONV 31getFoldingOffset(uint32_t data) { 32 /* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */ 33 if(data&0x8000) { 34 return (int32_t)(data&0x7fff); 35 } else { 36 return 0; 37 } 38} 39U_CDECL_END 40 41U_NAMESPACE_BEGIN 42 43//----------------------------------------------------------------------------- 44// 45// Constructors. 46// 47//----------------------------------------------------------------------------- 48RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status) { 49 init(data, status); 50} 51 52RBBIDataWrapper::RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt, UErrorCode &status) { 53 init(data, status); 54 fDontFreeData = TRUE; 55} 56 57RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) { 58 const RBBIDataHeader *d = (const RBBIDataHeader *) 59 // ((char *)&(udm->pHeader->info) + udm->pHeader->info.size); 60 // taking into consideration the padding added in by udata_write 61 ((char *)(udm->pHeader) + udm->pHeader->dataHeader.headerSize); 62 init(d, status); 63 fUDataMem = udm; 64} 65 66//----------------------------------------------------------------------------- 67// 68// init(). Does most of the work of construction, shared between the 69// constructors. 70// 71//----------------------------------------------------------------------------- 72void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { 73 if (U_FAILURE(status)) { 74 return; 75 } 76 fHeader = data; 77 if (fHeader->fMagic != 0xb1a0 || 78 !(fHeader->fFormatVersion[0] == 3 || // ICU 3.4 79 *(int32_t *)fHeader->fFormatVersion == 1)) // ICU 3.2 and earlier. 80 { 81 status = U_INVALID_FORMAT_ERROR; 82 return; 83 } 84 85 fDontFreeData = FALSE; 86 fUDataMem = NULL; 87 fReverseTable = NULL; 88 fSafeFwdTable = NULL; 89 fSafeRevTable = NULL; 90 if (data->fFTableLen != 0) { 91 fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable); 92 } 93 if (data->fRTableLen != 0) { 94 fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable); 95 } 96 if (data->fSFTableLen != 0) { 97 fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable); 98 } 99 if (data->fSRTableLen != 0) { 100 fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable); 101 } 102 103 104 utrie_unserialize(&fTrie, 105 (uint8_t *)data + fHeader->fTrie, 106 fHeader->fTrieLen, 107 &status); 108 if (U_FAILURE(status)) { 109 return; 110 } 111 fTrie.getFoldingOffset=getFoldingOffset; 112 113 114 fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource); 115 fRuleString.setTo(TRUE, fRuleSource, -1); 116 U_ASSERT(data->fRuleSourceLen > 0); 117 118 fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable); 119 fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t); 120 121 fRefCount = 1; 122 123#ifdef RBBI_DEBUG 124 char *debugEnv = getenv("U_RBBIDEBUG"); 125 if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();} 126#endif 127} 128 129 130//----------------------------------------------------------------------------- 131// 132// Destructor. Don't call this - use removeReference() instead. 133// 134//----------------------------------------------------------------------------- 135RBBIDataWrapper::~RBBIDataWrapper() { 136 U_ASSERT(fRefCount == 0); 137 if (fUDataMem) { 138 udata_close(fUDataMem); 139 } else if (!fDontFreeData) { 140 uprv_free((void *)fHeader); 141 } 142} 143 144 145 146//----------------------------------------------------------------------------- 147// 148// Operator == Consider two RBBIDataWrappers to be equal if they 149// refer to the same underlying data. Although 150// the data wrappers are normally shared between 151// iterator instances, it's possible to independently 152// open the same data twice, and get two instances, which 153// should still be ==. 154// 155//----------------------------------------------------------------------------- 156UBool RBBIDataWrapper::operator ==(const RBBIDataWrapper &other) const { 157 if (fHeader == other.fHeader) { 158 return TRUE; 159 } 160 if (fHeader->fLength != other.fHeader->fLength) { 161 return FALSE; 162 } 163 if (uprv_memcmp(fHeader, other.fHeader, fHeader->fLength) == 0) { 164 return TRUE; 165 } 166 return FALSE; 167} 168 169int32_t RBBIDataWrapper::hashCode() { 170 return fHeader->fFTableLen; 171} 172 173 174 175//----------------------------------------------------------------------------- 176// 177// Reference Counting. A single RBBIDataWrapper object is shared among 178// however many RulesBasedBreakIterator instances are 179// referencing the same data. 180// 181//----------------------------------------------------------------------------- 182void RBBIDataWrapper::removeReference() { 183 if (umtx_atomic_dec(&fRefCount) == 0) { 184 delete this; 185 } 186} 187 188 189RBBIDataWrapper *RBBIDataWrapper::addReference() { 190 umtx_atomic_inc(&fRefCount); 191 return this; 192} 193 194 195 196//----------------------------------------------------------------------------- 197// 198// getRuleSourceString 199// 200//----------------------------------------------------------------------------- 201const UnicodeString &RBBIDataWrapper::getRuleSourceString() const { 202 return fRuleString; 203} 204 205 206//----------------------------------------------------------------------------- 207// 208// print - debugging function to dump the runtime data tables. 209// 210//----------------------------------------------------------------------------- 211#ifdef RBBI_DEBUG 212void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *table) { 213 uint32_t c; 214 uint32_t s; 215 216 RBBIDebugPrintf(" %s\n", heading); 217 218 RBBIDebugPrintf("State | Acc LA TagIx"); 219 for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);} 220 RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) { 221 RBBIDebugPrintf("----"); 222 } 223 RBBIDebugPrintf("\n"); 224 225 if (table == NULL) { 226 RBBIDebugPrintf(" N U L L T A B L E\n\n"); 227 return; 228 } 229 for (s=0; s<table->fNumStates; s++) { 230 RBBIStateTableRow *row = (RBBIStateTableRow *) 231 (table->fTableData + (table->fRowLen * s)); 232 RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx); 233 for (c=0; c<fHeader->fCatCount; c++) { 234 RBBIDebugPrintf("%3d ", row->fNextState[c]); 235 } 236 RBBIDebugPrintf("\n"); 237 } 238 RBBIDebugPrintf("\n"); 239} 240#endif 241 242 243#ifdef RBBI_DEBUG 244void RBBIDataWrapper::printData() { 245 RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader); 246 RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1], 247 fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]); 248 RBBIDebugPrintf(" total length of data = %d\n", fHeader->fLength); 249 RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount); 250 251 printTable("Forward State Transition Table", fForwardTable); 252 printTable("Reverse State Transition Table", fReverseTable); 253 printTable("Safe Forward State Transition Table", fSafeFwdTable); 254 printTable("Safe Reverse State Transition Table", fSafeRevTable); 255 256 RBBIDebugPrintf("\nOrignal Rules source:\n"); 257 for (int32_t c=0; fRuleSource[c] != 0; c++) { 258 RBBIDebugPrintf("%c", fRuleSource[c]); 259 } 260 RBBIDebugPrintf("\n\n"); 261} 262#endif 263 264 265U_NAMESPACE_END 266U_NAMESPACE_USE 267 268//----------------------------------------------------------------------------- 269// 270// ubrk_swap - byte swap and char encoding swap of RBBI data 271// 272//----------------------------------------------------------------------------- 273 274U_CAPI int32_t U_EXPORT2 275ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, 276 UErrorCode *status) { 277 278 if (status == NULL || U_FAILURE(*status)) { 279 return 0; 280 } 281 if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { 282 *status=U_ILLEGAL_ARGUMENT_ERROR; 283 return 0; 284 } 285 286 // 287 // Check that the data header is for for break data. 288 // (Header contents are defined in genbrk.cpp) 289 // 290 const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); 291 if(!( pInfo->dataFormat[0]==0x42 && /* dataFormat="Brk " */ 292 pInfo->dataFormat[1]==0x72 && 293 pInfo->dataFormat[2]==0x6b && 294 pInfo->dataFormat[3]==0x20 && 295 pInfo->formatVersion[0]==3 )) { 296 udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n", 297 pInfo->dataFormat[0], pInfo->dataFormat[1], 298 pInfo->dataFormat[2], pInfo->dataFormat[3], 299 pInfo->formatVersion[0]); 300 *status=U_UNSUPPORTED_ERROR; 301 return 0; 302 } 303 304 // 305 // Swap the data header. (This is the generic ICU Data Header, not the RBBI Specific 306 // RBBIDataHeader). This swap also conveniently gets us 307 // the size of the ICU d.h., which lets us locate the start 308 // of the RBBI specific data. 309 // 310 int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); 311 312 313 // 314 // Get the RRBI Data Header, and check that it appears to be OK. 315 // 316 // Note: ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually 317 // an int32_t with a value of 1. Starting with ICU 3.4, 318 // RBBI's fDataFormat matches the dataFormat field from the 319 // UDataInfo header, four int8_t bytes. The value is {3,1,0,0} 320 // 321 const uint8_t *inBytes =(const uint8_t *)inData+headerSize; 322 RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes; 323 UBool formatVersionOne = ds->readUInt32(*(int32_t *)rbbiDH->fFormatVersion) == 1; 324 if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 || 325 !(formatVersionOne || rbbiDH->fFormatVersion[0] == 3) || 326 ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) 327 { 328 udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n"); 329 *status=U_UNSUPPORTED_ERROR; 330 return 0; 331 } 332 333 // 334 // Prefight operation? Just return the size 335 // 336 int32_t breakDataLength = ds->readUInt32(rbbiDH->fLength); 337 int32_t totalSize = headerSize + breakDataLength; 338 if (length < 0) { 339 return totalSize; 340 } 341 342 // 343 // Check that length passed in is consistent with length from RBBI data header. 344 // 345 if (length < totalSize) { 346 udata_printError(ds, "ubrk_swap(): too few bytes (%d after ICU Data header) for break data.\n", 347 breakDataLength); 348 *status=U_INDEX_OUTOFBOUNDS_ERROR; 349 return 0; 350 } 351 352 353 // 354 // Swap the Data. Do the data itself first, then the RBBI Data Header, because 355 // we need to reference the header to locate the data, and an 356 // inplace swap of the header leaves it unusable. 357 // 358 uint8_t *outBytes = (uint8_t *)outData + headerSize; 359 RBBIDataHeader *outputDH = (RBBIDataHeader *)outBytes; 360 361 int32_t tableStartOffset; 362 int32_t tableLength; 363 364 // 365 // If not swapping in place, zero out the output buffer before starting. 366 // Individual tables and other data items within are aligned to 8 byte boundaries 367 // when originally created. Any unused space between items needs to be zero. 368 // 369 if (inBytes != outBytes) { 370 uprv_memset(outBytes, 0, breakDataLength); 371 } 372 373 // 374 // Each state table begins with several 32 bit fields. Calculate the size 375 // in bytes of these. 376 // 377 int32_t topSize = offsetof(RBBIStateTable, fTableData); 378 379 // Forward state table. 380 tableStartOffset = ds->readUInt32(rbbiDH->fFTable); 381 tableLength = ds->readUInt32(rbbiDH->fFTableLen); 382 383 if (tableLength > 0) { 384 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 385 outBytes+tableStartOffset, status); 386 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 387 outBytes+tableStartOffset+topSize, status); 388 } 389 390 // Reverse state table. Same layout as forward table, above. 391 tableStartOffset = ds->readUInt32(rbbiDH->fRTable); 392 tableLength = ds->readUInt32(rbbiDH->fRTableLen); 393 394 if (tableLength > 0) { 395 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 396 outBytes+tableStartOffset, status); 397 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 398 outBytes+tableStartOffset+topSize, status); 399 } 400 401 // Safe Forward state table. Same layout as forward table, above. 402 tableStartOffset = ds->readUInt32(rbbiDH->fSFTable); 403 tableLength = ds->readUInt32(rbbiDH->fSFTableLen); 404 405 if (tableLength > 0) { 406 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 407 outBytes+tableStartOffset, status); 408 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 409 outBytes+tableStartOffset+topSize, status); 410 } 411 412 // Safe Reverse state table. Same layout as forward table, above. 413 tableStartOffset = ds->readUInt32(rbbiDH->fSRTable); 414 tableLength = ds->readUInt32(rbbiDH->fSRTableLen); 415 416 if (tableLength > 0) { 417 ds->swapArray32(ds, inBytes+tableStartOffset, topSize, 418 outBytes+tableStartOffset, status); 419 ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, 420 outBytes+tableStartOffset+topSize, status); 421 } 422 423 // Trie table for character categories 424 utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen), 425 outBytes+ds->readUInt32(rbbiDH->fTrie), status); 426 427 // Source Rules Text. It's UChar data 428 ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen), 429 outBytes+ds->readUInt32(rbbiDH->fRuleSource), status); 430 431 // Table of rule status values. It's all int_32 values 432 ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen), 433 outBytes+ds->readUInt32(rbbiDH->fStatusTable), status); 434 435 // And, last, the header. 436 // For the old version one format, the entire header consists of int32_t values. 437 // For the newer formats, the fDataFormat field is an array of four bytes. 438 // Swap the whole thing as int32_t, then, for the newer format, re-swap the one field. 439 // 440 ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status); 441 if (formatVersionOne == FALSE) { 442 ds->swapArray32(ds, outputDH->fFormatVersion, 4, outputDH->fFormatVersion, status); 443 } 444 445 446 return totalSize; 447} 448 449 450#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 451