utext.h revision 54dcd9b6a06071f647dac967e9e267abb9410720
1/* 2******************************************************************************* 3* 4* Copyright (C) 2004-2012, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: utext.h 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2004oct06 14* created by: Markus W. Scherer 15*/ 16 17#ifndef __UTEXT_H__ 18#define __UTEXT_H__ 19 20/** 21 * \file 22 * \brief C API: Abstract Unicode Text API 23 * 24 * The Text Access API provides a means to allow text that is stored in alternative 25 * formats to work with ICU services. ICU normally operates on text that is 26 * stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type 27 * UnicodeString for C++ APIs. 28 * 29 * ICU Text Access allows other formats, such as UTF-8 or non-contiguous 30 * UTF-16 strings, to be placed in a UText wrapper and then passed to ICU services. 31 * 32 * There are three general classes of usage for UText: 33 * 34 * Application Level Use. This is the simplest usage - applications would 35 * use one of the utext_open() functions on their input text, and pass 36 * the resulting UText to the desired ICU service. 37 * 38 * Second is usage in ICU Services, such as break iteration, that will need to 39 * operate on input presented to them as a UText. These implementations 40 * will need to use the iteration and related UText functions to gain 41 * access to the actual text. 42 * 43 * The third class of UText users are "text providers." These are the 44 * UText implementations for the various text storage formats. An application 45 * or system with a unique text storage format can implement a set of 46 * UText provider functions for that format, which will then allow 47 * ICU services to operate on that format. 
48 * 49 * 50 * <em>Iterating over text</em> 51 * 52 * Here is sample code for a forward iteration over the contents of a UText 53 * 54 * \code 55 * UChar32 c; 56 * UText *ut = whatever(); 57 * 58 * for (c=utext_next32From(ut, 0); c>=0; c=utext_next32(ut)) { 59 * // do whatever with the codepoint c here. 60 * } 61 * \endcode 62 * 63 * And here is similar code to iterate in the reverse direction, from the end 64 * of the text towards the beginning. 65 * 66 * \code 67 * UChar32 c; 68 * UText *ut = whatever(); 69 * int64_t textLength = utext_nativeLength(ut); 70 * for (c=utext_previous32From(ut, textLength); c>=0; c=utext_previous32(ut)) { 71 * // do whatever with the codepoint c here. 72 * } 73 * \endcode 74 * 75 * <em>Characters and Indexing</em> 76 * 77 * Indexing into text by UText functions is nearly always in terms of the native 78 * indexing of the underlying text storage. The storage format could be UTF-8 79 * or UTF-32, for example. When coding to the UText access API, no assumptions 80 * can be made regarding the size of characters, or how far an index 81 * may move when iterating between characters. 82 * 83 * All indices supplied to UText functions are pinned to the length of the 84 * text. An out-of-bounds index is not considered to be an error, but is 85 * adjusted to be in the range 0 <= index <= length of input text. 86 * 87 * 88 * When an index position is returned from a UText function, it will be 89 * a native index to the underlying text. In the case of multi-unit characters, 90 * it will always refer to the first position of the character, 91 * never to the interior. This is essentially the same thing as saying that 92 * a returned index will always point to a boundary between characters. 93 * 94 * When a native index is supplied to a UText function, all indices that 95 * refer to any part of a multi-unit character representation are considered 96 * to be equivalent. 
In the case of multi-unit characters, an incoming index 97 * will be logically normalized to refer to the start of the character. 98 * 99 * It is possible to test whether a native index is on a code point boundary 100 * by doing a utext_setNativeIndex() followed by a utext_getNativeIndex(). 101 * If the index is returned unchanged, it was on a code point boundary. If 102 * an adjusted index is returned, the original index referred to the 103 * interior of a character. 104 * 105 * <em>Conventions for calling UText functions</em> 106 * 107 * Most UText access functions have as their first parameter a (UText *) pointer, 108 * which specifies the UText to be used. Unless otherwise noted, the 109 * pointer must refer to a valid, open UText. Attempting to 110 * use a closed UText or passing a NULL pointer is a programming error and 111 * will produce undefined results or NULL pointer exceptions. 112 * 113 * The utext_open family of functions can either open an existing (closed) 114 * UText, or heap allocate a new UText. Here is sample code for creating 115 * a stack-allocated UText. 116 * 117 * \code 118 * char *s = whatever(); // A utf-8 string 119 * UErrorCode status = U_ZERO_ERROR; 120 * UText ut = UTEXT_INITIALIZER; 121 * utext_openUTF8(&ut, s, -1, &status); 122 * if (U_FAILURE(status)) { 123 * // error handling 124 * } else { 125 * // work with the UText 126 * } 127 * \endcode 128 * 129 * Any existing UText passed to an open function _must_ have been initialized, 130 * either by the UTEXT_INITIALIZER, or by having been originally heap-allocated 131 * by an open function. Passing NULL will cause the open function to 132 * heap-allocate and fully initialize a new UText. 
133 * 134 */ 135 136 137 138#include "unicode/utypes.h" 139#include "unicode/uchar.h" 140#if U_SHOW_CPLUSPLUS_API 141#include "unicode/localpointer.h" 142#include "unicode/rep.h" 143#include "unicode/unistr.h" 144#include "unicode/chariter.h" 145#endif 146 147 148U_CDECL_BEGIN 149 150struct UText; 151typedef struct UText UText; /**< C typedef for struct UText. @stable ICU 3.6 */ 152 153 154/*************************************************************************************** 155 * 156 * C Functions for creating UText wrappers around various kinds of text strings. 157 * 158 ****************************************************************************************/ 159 160 161/** 162 * Close function for UText instances. 163 * Cleans up, releases any resources being held by an open UText. 164 * <p> 165 * If the UText was originally allocated by one of the utext_open functions, 166 * the storage associated with the utext will also be freed. 167 * If the UText storage originated with the application, as it would with 168 * a local or static instance, the storage will not be deleted. 169 * 170 * An open UText can be reset to refer to new string by using one of the utext_open() 171 * functions without first closing the UText. 172 * 173 * @param ut The UText to be closed. 174 * @return NULL if the UText struct was deleted by the close. If the UText struct 175 * was originally provided by the caller to the open function, it is 176 * returned by this function, and may be safely used again in 177 * a subsequent utext_open. 178 * 179 * @stable ICU 3.4 180 */ 181U_STABLE UText * U_EXPORT2 182utext_close(UText *ut); 183 184#if U_SHOW_CPLUSPLUS_API 185 186U_NAMESPACE_BEGIN 187 188/** 189 * \class LocalUTextPointer 190 * "Smart pointer" class, closes a UText via utext_close(). 191 * For most methods see the LocalPointerBase base class. 
192 * 193 * @see LocalPointerBase 194 * @see LocalPointer 195 * @stable ICU 4.4 196 */ 197U_DEFINE_LOCAL_OPEN_POINTER(LocalUTextPointer, UText, utext_close); 198 199U_NAMESPACE_END 200 201#endif 202 203/** 204 * Open a read-only UText implementation for UTF-8 strings. 205 * 206 * \htmlonly 207 * Any invalid UTF-8 in the input will be handled in this way: 208 * a sequence of bytes that has the form of a truncated, but otherwise valid, 209 * UTF-8 sequence will be replaced by a single unicode replacement character, \uFFFD. 210 * Any other illegal bytes will each be replaced by a \uFFFD. 211 * \endhtmlonly 212 * 213 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 214 * If non-NULL, must refer to an initialized UText struct, which will then 215 * be reset to reference the specified UTF-8 string. 216 * @param s A UTF-8 string. Must not be NULL. 217 * @param length The length of the UTF-8 string in bytes, or -1 if the string is 218 * zero terminated. 219 * @param status Errors are returned here. 220 * @return A pointer to the UText. If a pre-allocated UText was provided, it 221 * will always be used and returned. 222 * @stable ICU 3.4 223 */ 224U_STABLE UText * U_EXPORT2 225utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status); 226 227 228/** 229 * Open a read-only UText for UChar * string. 230 * 231 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 232 * If non-NULL, must refer to an initialized UText struct, which will then 233 * be reset to reference the specified UChar string. 234 * @param s A UChar (UTF-16) string 235 * @param length The number of UChars in the input string, or -1 if the string is 236 * zero terminated. 237 * @param status Errors are returned here. 238 * @return A pointer to the UText. If a pre-allocated UText was provided, it 239 * will always be used and returned. 
240 * @stable ICU 3.4 241 */ 242U_STABLE UText * U_EXPORT2 243utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status); 244 245 246#if U_SHOW_CPLUSPLUS_API 247/** 248 * Open a writable UText for a non-const UnicodeString. 249 * 250 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 251 * If non-NULL, must refer to an initialized UText struct, which will then 252 * be reset to reference the specified input string. 253 * @param s A UnicodeString. 254 * @param status Errors are returned here. 255 * @return Pointer to the UText. If a UText was supplied as input, this 256 * will always be used and returned. 257 * @stable ICU 3.4 258 */ 259U_STABLE UText * U_EXPORT2 260utext_openUnicodeString(UText *ut, icu::UnicodeString *s, UErrorCode *status); 261 262 263/** 264 * Open a UText for a const UnicodeString. The resulting UText will not be writable. 265 * 266 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 267 * If non-NULL, must refer to an initialized UText struct, which will then 268 * be reset to reference the specified input string. 269 * @param s A const UnicodeString to be wrapped. 270 * @param status Errors are returned here. 271 * @return Pointer to the UText. If a UText was supplied as input, this 272 * will always be used and returned. 273 * @stable ICU 3.4 274 */ 275U_STABLE UText * U_EXPORT2 276utext_openConstUnicodeString(UText *ut, const icu::UnicodeString *s, UErrorCode *status); 277 278 279/** 280 * Open a writable UText implementation for an ICU Replaceable object. 281 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 282 * If non-NULL, must refer to an already existing UText, which will then 283 * be reset to reference the specified replaceable text. 284 * @param rep A Replaceable text object. 285 * @param status Errors are returned here. 286 * @return Pointer to the UText. If a UText was supplied as input, this 287 * will always be used and returned. 
288 * @see Replaceable 289 * @stable ICU 3.4 290 */ 291U_STABLE UText * U_EXPORT2 292utext_openReplaceable(UText *ut, icu::Replaceable *rep, UErrorCode *status); 293 294/** 295 * Open a UText implementation over an ICU CharacterIterator. 296 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 297 * If non-NULL, must refer to an already existing UText, which will then 298 * be reset to reference the specified character iterator. 299 * @param ci A Character Iterator. 300 * @param status Errors are returned here. 301 * @return Pointer to the UText. If a UText was supplied as input, this 302 * will always be used and returned. 303 * @see CharacterIterator 304 * @stable ICU 3.4 305 */ 306U_STABLE UText * U_EXPORT2 307utext_openCharacterIterator(UText *ut, icu::CharacterIterator *ci, UErrorCode *status); 308 309#endif 310 311 312/** 313 * Clone a UText. This is much like opening a UText where the source text is itself 314 * another UText. 315 * 316 * A deep clone will copy both the UText data structures and the underlying text. 317 * The original and cloned UText will operate completely independently; modifications 318 * made to the text in one will not affect the other. Text providers are not 319 * required to support deep clones. The user of clone() must check the status return 320 * and be prepared to handle failures. 321 * 322 * The standard UText implementations for UTF8, UChar *, UnicodeString and 323 * Replaceable all support deep cloning. 324 * 325 * The UText returned from a deep clone will be writable, assuming that the text 326 * provider is able to support writing, even if the source UText had been made 327 * non-writable by means of utext_freeze(). 328 * 329 * A shallow clone replicates only the UText data structures; it does not make 330 * a copy of the underlying text. Shallow clones can be used as an efficient way to 331 * have multiple iterators active in a single text string that is not being 332 * modified. 
333 * 334 * A shallow clone operation will not fail, barring truly exceptional conditions such 335 * as memory allocation failures. 336 * 337 * Shallow UText clones should be avoided if the UText functions that modify the 338 * text are expected to be used, either on the original or the cloned UText. 339 * Any such modifications can cause unpredictable behavior. Read Only 340 * shallow clones provide some protection against errors of this type by 341 * disabling text modification via the cloned UText. 342 * 343 * A shallow clone made with the readOnly parameter == FALSE will preserve the 344 * utext_isWritable() state of the source object. Note, however, that 345 * write operations must be avoided while more than one UText exists that refer 346 * to the same underlying text. 347 * 348 * A UText and its clone may be safely concurrently accessed by separate threads. 349 * This is true for read access only with shallow clones, and for both read and 350 * write access with deep clones. 351 * It is the responsibility of the Text Provider to ensure that this thread safety 352 * constraint is met. 353 * 354 * @param dest A UText struct to be filled in with the result of the clone operation, 355 * or NULL if the clone function should heap-allocate a new UText struct. 356 * If non-NULL, must refer to an already existing UText, which will then 357 * be reset to become the clone. 358 * @param src The UText to be cloned. 359 * @param deep TRUE to request a deep clone, FALSE for a shallow clone. 360 * @param readOnly TRUE to request that the cloned UText have read only access to the 361 * underlying text. 362 363 * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR 364 * will be returned if the text provider is unable to clone the 365 * original text. 366 * @return The newly created clone, or NULL if the clone operation failed. 
367 * @stable ICU 3.4 368 */ 369U_STABLE UText * U_EXPORT2 370utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status); 371 372 373/** 374 * Compare two UText objects for equality. 375 * UTexts are equal if they are iterating over the same text, and 376 * have the same iteration position within the text. 377 * If either or both of the parameters are NULL, the comparison is FALSE. 378 * 379 * @param a The first of the two UTexts to compare. 380 * @param b The other UText to be compared. 381 * @return TRUE if the two UTexts are equal. 382 * @stable ICU 3.6 383 */ 384U_STABLE UBool U_EXPORT2 385utext_equals(const UText *a, const UText *b); 386 387 388/***************************************************************************** 389 * 390 * Functions to work with the text represented by a UText wrapper 391 * 392 *****************************************************************************/ 393 394/** 395 * Get the length of the text. Depending on the characteristics 396 * of the underlying text representation, this may be expensive. 397 * @see utext_isLengthExpensive() 398 * 399 * 400 * @param ut the text to be accessed. 401 * @return the length of the text, expressed in native units. 402 * 403 * @stable ICU 3.4 404 */ 405U_STABLE int64_t U_EXPORT2 406utext_nativeLength(UText *ut); 407 408/** 409 * Return TRUE if calculating the length of the text could be expensive. 410 * Finding the length of NUL terminated strings is considered to be expensive. 411 * 412 * Note that the value of this function may change 413 * as the result of other operations on a UText. 414 * Once the length of a string has been discovered, it will no longer 415 * be expensive to report it. 416 * 417 * @param ut the text to be accessed. 418 * @return TRUE if determining the length of the text could be time consuming. 
419 * @stable ICU 3.4 420 */ 421U_STABLE UBool U_EXPORT2 422utext_isLengthExpensive(const UText *ut); 423 424/** 425 * Returns the code point at the requested index, 426 * or U_SENTINEL (-1) if it is out of bounds. 427 * 428 * If the specified index points to the interior of a multi-unit 429 * character - one of the trail bytes of a UTF-8 sequence, for example - 430 * the complete code point will be returned. 431 * 432 * The iteration position will be set to the start of the returned code point. 433 * 434 * This function is roughly equivalent to the sequence 435 * utext_setNativeIndex(index); 436 * utext_current32(); 437 * (There is a subtle difference if the index is out of bounds by being less than zero - 438 * utext_setNativeIndex(negative value) sets the index to zero, after which utext_current32() 439 * will return the char at zero. utext_char32At(negative index), on the other hand, will 440 * return the U_SENTINEL value of -1.) 441 * 442 * @param ut the text to be accessed 443 * @param nativeIndex the native index of the character to be accessed. If the index points 444 * to other than the first unit of a multi-unit character, it will be adjusted 445 * to the start of the character. 446 * @return the code point at the specified index. 447 * @stable ICU 3.4 448 */ 449U_STABLE UChar32 U_EXPORT2 450utext_char32At(UText *ut, int64_t nativeIndex); 451 452 453/** 454 * 455 * Get the code point at the current iteration position, 456 * or U_SENTINEL (-1) if the iteration has reached the end of 457 * the input text. 458 * 459 * @param ut the text to be accessed. 460 * @return the Unicode code point at the current iterator position. 461 * @stable ICU 3.4 462 */ 463U_STABLE UChar32 U_EXPORT2 464utext_current32(UText *ut); 465 466 467/** 468 * Get the code point at the current iteration position of the UText, and 469 * advance the position to the first index following the character. 
470 * 471 * If the position is at the end of the text (the index following 472 * the last character, which is also the length of the text), 473 * return U_SENTINEL (-1) and do not advance the index. 474 * 475 * This is a post-increment operation. 476 * 477 * An inline macro version of this function, UTEXT_NEXT32(), 478 * is available for performance critical use. 479 * 480 * @param ut the text to be accessed. 481 * @return the Unicode code point at the iteration position. 482 * @see UTEXT_NEXT32 483 * @stable ICU 3.4 484 */ 485U_STABLE UChar32 U_EXPORT2 486utext_next32(UText *ut); 487 488 489/** 490 * Move the iterator position to the character (code point) whose 491 * index precedes the current position, and return that character. 492 * This is a pre-decrement operation. 493 * 494 * If the initial position is at the start of the text (index of 0) 495 * return U_SENTINEL (-1), and leave the position unchanged. 496 * 497 * An inline macro version of this function, UTEXT_PREVIOUS32(), 498 * is available for performance critical use. 499 * 500 * @param ut the text to be accessed. 501 * @return the previous UChar32 code point, or U_SENTINEL (-1) 502 * if the iteration has reached the start of the text. 503 * @see UTEXT_PREVIOUS32 504 * @stable ICU 3.4 505 */ 506U_STABLE UChar32 U_EXPORT2 507utext_previous32(UText *ut); 508 509 510/** 511 * Set the iteration index and return the code point at that index. 512 * Leave the iteration index at the start of the following code point. 513 * 514 * This function is the most efficient and convenient way to 515 * begin a forward iteration. The results are identical to those 516 * from the sequence 517 * \code 518 * utext_setNativeIndex(); 519 * utext_next32(); 520 * \endcode 521 * 522 * @param ut the text to be accessed. 523 * @param nativeIndex Iteration index, in the native units of the text provider. 524 * @return Code point which starts at or before index, 525 * or U_SENTINEL (-1) if it is out of bounds. 
526 * @stable ICU 3.4 527 */ 528U_STABLE UChar32 U_EXPORT2 529utext_next32From(UText *ut, int64_t nativeIndex); 530 531 532 533/** 534 * Set the iteration index, and return the code point preceding the 535 * one specified by the initial index. Leave the iteration position 536 * at the start of the returned code point. 537 * 538 * This function is the most efficient and convenient way to 539 * begin a backwards iteration. 540 * 541 * @param ut the text to be accessed. 542 * @param nativeIndex Iteration index in the native units of the text provider. 543 * @return Code point preceding the one at the initial index, 544 * or U_SENTINEL (-1) if it is out of bounds. 545 * 546 * @stable ICU 3.4 547 */ 548U_STABLE UChar32 U_EXPORT2 549utext_previous32From(UText *ut, int64_t nativeIndex); 550 551/** 552 * Get the current iterator position, which can range from 0 to 553 * the length of the text. 554 * The position is a native index into the input text, in whatever format it 555 * may have (possibly UTF-8 for example), and may not always be the same as 556 * the corresponding UChar (UTF-16) index. 557 * The returned position will always be aligned to a code point boundary. 558 * 559 * @param ut the text to be accessed. 560 * @return the current index position, in the native units of the text provider. 561 * @stable ICU 3.4 562 */ 563U_STABLE int64_t U_EXPORT2 564utext_getNativeIndex(const UText *ut); 565 566/** 567 * Set the current iteration position to the nearest code point 568 * boundary at or preceding the specified index. 569 * The index is in the native units of the original input text. 570 * If the index is out of range, it will be pinned to be within 571 * the range of the input text. 572 * <p> 573 * It will usually be more efficient to begin an iteration 574 * using the functions utext_next32From() or utext_previous32From() 575 * rather than utext_setNativeIndex(). 
576 * <p> 577 * Moving the index position to an adjacent character is best done 578 * with utext_next32(), utext_previous32() or utext_moveIndex32(). 579 * Attempting to do direct arithmetic on the index position is 580 * complicated by the fact that the size (in native units) of a 581 * character depends on the underlying representation of the character 582 * (UTF-8, UTF-16, UTF-32, arbitrary codepage), and is not 583 * easily knowable. 584 * 585 * @param ut the text to be accessed. 586 * @param nativeIndex the native unit index of the new iteration position. 587 * @stable ICU 3.4 588 */ 589U_STABLE void U_EXPORT2 590utext_setNativeIndex(UText *ut, int64_t nativeIndex); 591 592/** 593 * Move the iterator position by delta code points. The number of code points 594 * is a signed number; a negative delta will move the iterator backwards, 595 * towards the start of the text. 596 * <p> 597 * The index is moved by <code>delta</code> code points 598 * forward or backward, but no further backward than to 0 and 599 * no further forward than to utext_nativeLength(). 600 * The resulting index value will be in between 0 and length, inclusive. 601 * 602 * @param ut the text to be accessed. 603 * @param delta the signed number of code points to move the iteration position. 604 * @return TRUE if the position could be moved the requested number of positions while 605 * staying within the range [0 - text length]. 606 * @stable ICU 3.4 607 */ 608U_STABLE UBool U_EXPORT2 609utext_moveIndex32(UText *ut, int32_t delta); 610 611/** 612 * Get the native index of the character preceding the current position. 613 * If the iteration position is already at the start of the text, zero 614 * is returned. 615 * The value returned is the same as that obtained from the following sequence, 616 * but without the side effect of changing the iteration position. 617 * 618 * \code 619 * UText *ut = whatever; 620 * ... 
621 * utext_previous32(ut); 622 * utext_getNativeIndex(ut); 623 * \endcode 624 * 625 * This function is most useful during forwards iteration, where it will get the 626 * native index of the character most recently returned from utext_next32(). 627 * 628 * @param ut the text to be accessed 629 * @return the native index of the character preceding the current index position, 630 * or zero if the current position is at the start of the text. 631 * @stable ICU 3.6 632 */ 633U_STABLE int64_t U_EXPORT2 634utext_getPreviousNativeIndex(UText *ut); 635 636 637/** 638 * 639 * Extract text from a UText into a UChar buffer. The range of text to be extracted 640 * is specified in the native indices of the UText provider. These may not necessarily 641 * be UTF-16 indices. 642 * <p> 643 * The size (number of 16 bit UChars) of the data to be extracted is returned. The 644 * full number of UChars is returned, even when the extracted text is truncated 645 * because the specified buffer size is too small. 646 * <p> 647 * The extracted string will (if you are a user) / must (if you are a text provider) 648 * be NUL-terminated if there is sufficient space in the destination buffer. This 649 * terminating NUL is not included in the returned length. 650 * <p> 651 * The iteration index is left at the position following the last extracted character. 652 * 653 * @param ut the UText from which to extract data. 654 * @param nativeStart the native index of the first character to extract. 655 * If the specified index is out of range, 656 * it will be pinned to be within 0 <= index <= textLength. 657 * @param nativeLimit the native string index of the position following the last 658 * character to extract. If the specified index is out of range, 659 * it will be pinned to be within 0 <= index <= textLength. 660 * nativeLimit must be >= nativeStart. 
661 * @param dest the UChar (UTF-16) buffer into which the extracted text is placed 662 * @param destCapacity The size, in UChars, of the destination buffer. May be zero 663 * for precomputing the required size. 664 * @param status receives any error status. 665 * U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the 666 * buffer was too small. Returns number of UChars for preflighting. 667 * @return Number of UChars in the data to be extracted. Does not include a trailing NUL. 668 * 669 * @stable ICU 3.4 670 */ 671U_STABLE int32_t U_EXPORT2 672utext_extract(UText *ut, 673 int64_t nativeStart, int64_t nativeLimit, 674 UChar *dest, int32_t destCapacity, 675 UErrorCode *status); 676 677 678 679/************************************************************************************ 680 * 681 * #define inline versions of selected performance-critical text access functions 682 * Caution: do not use auto increment++ or decrement-- expressions 683 * as parameters to these macros. 684 * 685 * For most use, where there is no extreme performance constraint, the 686 * normal, non-inline functions are a better choice. The resulting code 687 * will be smaller, and, if the need ever arises, easier to debug. 688 * 689 * These are implemented as #defines rather than real functions 690 * because there is no fully portable way to do inline functions in plain C. 691 * 692 ************************************************************************************/ 693 694#ifndef U_HIDE_INTERNAL_API 695/** 696 * inline version of utext_current32(), for performance-critical situations. 697 * 698 * Get the code point at the current iteration position of the UText. 699 * Returns U_SENTINEL (-1) if the position is at the end of the 700 * text. The ut argument is expanded more than once, so it must be free of side effects. 701 * 702 * @internal ICU 4.4 technology preview 703 */ 704#define UTEXT_CURRENT32(ut) \ 705 ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? 
\ 706 ((ut)->chunkContents)[((ut)->chunkOffset)] : utext_current32(ut)) 707#endif /* U_HIDE_INTERNAL_API */ 708 709/** 710 * inline version of utext_next32(), for performance-critical situations. 711 * 712 * Get the code point at the current iteration position of the UText, and 713 * advance the position to the first index following the character. 714 * This is a post-increment operation. 715 * Returns U_SENTINEL (-1) if the position is at the end of the 716 * text. The code unit is read in line only when the chunk offset is below the chunk length and that unit is below 0xd800; every other case is passed to utext_next32(). The ut argument is expanded more than once, so it must be free of side effects. 717 * 718 * @stable ICU 3.4 719 */ 720#define UTEXT_NEXT32(ut) \ 721 ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \ 722 ((ut)->chunkContents)[((ut)->chunkOffset)++] : utext_next32(ut)) 723 724/** 725 * inline version of utext_previous32(), for performance-critical situations. 726 * 727 * Move the iterator position to the character (code point) whose 728 * index precedes the current position, and return that character. 729 * This is a pre-decrement operation. 730 * Returns U_SENTINEL (-1) if the position is at the start of the text. The code unit is read in line only when the chunk offset is above zero and the preceding unit is below 0xd800; every other case is passed to utext_previous32(). The ut argument is expanded more than once, so it must be free of side effects. 731 * 732 * @stable ICU 3.4 733 */ 734#define UTEXT_PREVIOUS32(ut) \ 735 ((ut)->chunkOffset > 0 && \ 736 (ut)->chunkContents[(ut)->chunkOffset-1] < 0xd800 ? \ 737 (ut)->chunkContents[--((ut)->chunkOffset)] : utext_previous32(ut)) 738 739/** 740 * inline version of utext_getNativeIndex(), for performance-critical situations. 741 * 742 * Get the current iterator position, which can range from 0 to 743 * the length of the text. 744 * The position is a native index into the input text, in whatever format it 745 * may have (possibly UTF-8 for example), and may not always be the same as 746 * the corresponding UChar (UTF-16) index. 747 * The returned position will always be aligned to a code point boundary. The ut argument is expanded more than once, so it must be free of side effects. 748 * 749 * @stable ICU 3.6 750 */ 751#define UTEXT_GETNATIVEINDEX(ut) \ 752 ((ut)->chunkOffset <= (ut)->nativeIndexingLimit? 
\ 753 (ut)->chunkNativeStart+(ut)->chunkOffset : \ 754 (ut)->pFuncs->mapOffsetToNative(ut)) 755 756/** 757 * inline version of utext_setNativeIndex(), for performance-critical situations. 758 * 759 * Set the current iteration position to the nearest code point 760 * boundary at or preceding the specified index. 761 * The index is in the native units of the original input text. 762 * If the index is out of range, it will be pinned to be within 763 * the range of the input text. The macro expands to a brace-enclosed block, not an expression, and produces no value. The offset is stored directly only when it lies between zero and nativeIndexingLimit relative to chunkNativeStart; otherwise utext_setNativeIndex() is called. Both ut and ix are expanded more than once, so they must be free of side effects. 764 * 765 * @stable ICU 3.8 766 */ 767#define UTEXT_SETNATIVEINDEX(ut, ix) \ 768 { int64_t __offset = (ix) - (ut)->chunkNativeStart; \ 769 if (__offset>=0 && __offset<=(int64_t)(ut)->nativeIndexingLimit) { \ 770 (ut)->chunkOffset=(int32_t)__offset; \ 771 } else { \ 772 utext_setNativeIndex((ut), (ix)); } } 773 774 775 776/************************************************************************************ 777 * 778 * Functions related to writing or modifying the text. 779 * These will work only with modifiable UTexts. Attempting to 780 * modify a read-only UText will return an error status. 781 * 782 ************************************************************************************/ 783 784 785/** 786 * Return TRUE if the text can be written (modified) with utext_replace() or 787 * utext_copy(). For the text to be writable, the text provider must 788 * be of a type that supports writing and the UText must not be frozen. 789 * 790 * Attempting to modify text when utext_isWritable() is FALSE will fail - 791 * the text will not be modified, and an error will be returned from the function 792 * that attempted the modification. 793 * 794 * @param ut the UText to be tested. 795 * @return TRUE if the text is modifiable. 796 * 797 * @see utext_freeze() 798 * @see utext_replace() 799 * @see utext_copy() 800 * @stable ICU 3.4 801 * 802 */ 803U_STABLE UBool U_EXPORT2 804utext_isWritable(const UText *ut); 805 806 807/** 808 * Test whether there is meta data associated with the text. 
809 * @see Replaceable::hasMetaData() 810 * 811 * @param ut The UText to be tested 812 * @return TRUE if the underlying text includes meta data. 813 * @stable ICU 3.4 814 */ 815U_STABLE UBool U_EXPORT2 816utext_hasMetaData(const UText *ut); 817 818 819/** 820 * Replace a range of the original text with a replacement text. 821 * 822 * Leaves the current iteration position at the position following the 823 * newly inserted replacement text. 824 * 825 * This function is only available on UText types that support writing, 826 * that is, ones where utext_isWritable() returns TRUE. 827 * 828 * When using this function, there should be only a single UText opened onto the 829 * underlying native text string. Behavior after a replace operation 830 * on a UText is undefined for any other additional UTexts that refer to the 831 * modified string. 832 * 833 * @param ut the UText representing the text to be operated on. 834 * @param nativeStart the native index of the start of the region to be replaced 835 * @param nativeLimit the native index of the character following the region to be replaced. 836 * @param replacementText pointer to the replacement text 837 * @param replacementLength length of the replacement text, or -1 if the text is NUL terminated. 838 * @param status receives any error status. Possible errors include 839 * U_NO_WRITE_PERMISSION 840 * 841 * @return The signed number of (native) storage units by which 842 * the length of the text expanded or contracted. 843 * 844 * @stable ICU 3.4 845 */ 846U_STABLE int32_t U_EXPORT2 847utext_replace(UText *ut, 848 int64_t nativeStart, int64_t nativeLimit, 849 const UChar *replacementText, int32_t replacementLength, 850 UErrorCode *status); 851 852 853 854/** 855 * 856 * Copy or move a substring from one position to another within the text, 857 * while retaining any metadata associated with the text. 858 * This function is used to duplicate or reorder substrings. 
859 * The destination index must not overlap the source range. 860 * 861 * The text to be copied or moved is inserted at destIndex; 862 * it does not replace or overwrite any existing text. 863 * 864 * The iteration position is left following the newly inserted text 865 * at the destination position. 866 * 867 * This function is only available on UText types that support writing, 868 * that is, ones where utext_isWritable() returns TRUE. 869 * 870 * When using this function, there should be only a single UText opened onto the 871 * underlying native text string. Behavior after a copy operation 872 * on a UText is undefined in any other additional UTexts that refer to the 873 * modified string. 874 * 875 * @param ut The UText representing the text to be operated on. 876 * @param nativeStart The native index of the start of the region to be copied or moved 877 * @param nativeLimit The native index of the character position following the region 878 * to be copied. 879 * @param destIndex The native destination index to which the source substring is 880 * copied or moved. 881 * @param move If TRUE, then the substring is moved, not copied/duplicated. 882 * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION 883 * 884 * @stable ICU 3.4 885 */ 886U_STABLE void U_EXPORT2 887utext_copy(UText *ut, 888 int64_t nativeStart, int64_t nativeLimit, 889 int64_t destIndex, 890 UBool move, 891 UErrorCode *status); 892 893 894/** 895 * <p> 896 * Freeze a UText. This prevents any modification to the underlying text itself 897 * by means of functions operating on this UText. 898 * </p> 899 * <p> 900 * Once frozen, a UText can not be unfrozen. The intent is to ensure 901 * that a the text underlying a frozen UText wrapper cannot be modified via that UText. 
902 * </p> 903 * <p> 904 * Caution: freezing a UText will disable changes made via the specific 905 * frozen UText wrapper only; it will not have any effect on the ability to 906 * directly modify the text by bypassing the UText. Any such backdoor modifications 907 * are always an error while UText access is occuring because the underlying 908 * text can get out of sync with UText's buffering. 909 * </p> 910 * 911 * @param ut The UText to be frozen. 912 * @see utext_isWritable() 913 * @stable ICU 3.6 914 */ 915U_STABLE void U_EXPORT2 916utext_freeze(UText *ut); 917 918 919/** 920 * UText provider properties (bit field indexes). 921 * 922 * @see UText 923 * @stable ICU 3.4 924 */ 925enum { 926 /** 927 * It is potentially time consuming for the provider to determine the length of the text. 928 * @stable ICU 3.4 929 */ 930 UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE = 1, 931 /** 932 * Text chunks remain valid and usable until the text object is modified or 933 * deleted, not just until the next time the access() function is called 934 * (which is the default). 935 * @stable ICU 3.4 936 */ 937 UTEXT_PROVIDER_STABLE_CHUNKS = 2, 938 /** 939 * The provider supports modifying the text via the replace() and copy() 940 * functions. 941 * @see Replaceable 942 * @stable ICU 3.4 943 */ 944 UTEXT_PROVIDER_WRITABLE = 3, 945 /** 946 * There is meta data associated with the text. 947 * @see Replaceable::hasMetaData() 948 * @stable ICU 3.4 949 */ 950 UTEXT_PROVIDER_HAS_META_DATA = 4, 951 /** 952 * Text provider owns the text storage. 953 * Generally occurs as the result of a deep clone of the UText. 954 * When closing the UText, the associated text must 955 * also be closed/deleted/freed/ whatever is appropriate. 956 * @stable ICU 3.6 957 */ 958 UTEXT_PROVIDER_OWNS_TEXT = 5 959}; 960 961/** 962 * Function type declaration for UText.clone(). 963 * 964 * clone a UText. Much like opening a UText where the source text is itself 965 * another UText. 
966 * 967 * A deep clone will copy both the UText data structures and the underlying text. 968 * The original and cloned UText will operate completely independently; modifications 969 * made to the text in one will not effect the other. Text providers are not 970 * required to support deep clones. The user of clone() must check the status return 971 * and be prepared to handle failures. 972 * 973 * A shallow clone replicates only the UText data structures; it does not make 974 * a copy of the underlying text. Shallow clones can be used as an efficient way to 975 * have multiple iterators active in a single text string that is not being 976 * modified. 977 * 978 * A shallow clone operation must not fail except for truly exceptional conditions such 979 * as memory allocation failures. 980 * 981 * A UText and its clone may be safely concurrently accessed by separate threads. 982 * This is true for both shallow and deep clones. 983 * It is the responsibility of the Text Provider to ensure that this thread safety 984 * constraint is met. 985 986 * 987 * @param dest A UText struct to be filled in with the result of the clone operation, 988 * or NULL if the clone function should heap-allocate a new UText struct. 989 * @param src The UText to be cloned. 990 * @param deep TRUE to request a deep clone, FALSE for a shallow clone. 991 * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR 992 * should be returned if the text provider is unable to clone the 993 * original text. 994 * @return The newly created clone, or NULL if the clone operation failed. 995 * 996 * @stable ICU 3.4 997 */ 998typedef UText * U_CALLCONV 999UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status); 1000 1001 1002/** 1003 * Function type declaration for UText.nativeLength(). 1004 * 1005 * @param ut the UText to get the length of. 1006 * @return the length, in the native units of the original text string. 
1007 * @see UText 1008 * @stable ICU 3.4 1009 */ 1010typedef int64_t U_CALLCONV 1011UTextNativeLength(UText *ut); 1012 1013/** 1014 * Function type declaration for UText.access(). Get the description of the text chunk 1015 * containing the text at a requested native index. The UText's iteration 1016 * position will be left at the requested index. If the index is out 1017 * of bounds, the iteration position will be left at the start or end 1018 * of the string, as appropriate. 1019 * 1020 * Chunks must begin and end on code point boundaries. A single code point 1021 * comprised of multiple storage units must never span a chunk boundary. 1022 * 1023 * 1024 * @param ut the UText being accessed. 1025 * @param nativeIndex Requested index of the text to be accessed. 1026 * @param forward If TRUE, then the returned chunk must contain text 1027 * starting from the index, so that start<=index<limit. 1028 * If FALSE, then the returned chunk must contain text 1029 * before the index, so that start<index<=limit. 1030 * @return True if the requested index could be accessed. The chunk 1031 * will contain the requested text. 1032 * False value if a chunk cannot be accessed 1033 * (the requested index is out of bounds). 1034 * 1035 * @see UText 1036 * @stable ICU 3.4 1037 */ 1038typedef UBool U_CALLCONV 1039UTextAccess(UText *ut, int64_t nativeIndex, UBool forward); 1040 1041/** 1042 * Function type declaration for UText.extract(). 1043 * 1044 * Extract text from a UText into a UChar buffer. The range of text to be extracted 1045 * is specified in the native indices of the UText provider. These may not necessarily 1046 * be UTF-16 indices. 1047 * <p> 1048 * The size (number of 16 bit UChars) in the data to be extracted is returned. The 1049 * full amount is returned, even when the specified buffer size is smaller. 
1050 * <p> 1051 * The extracted string will (if you are a user) / must (if you are a text provider) 1052 * be NUL-terminated if there is sufficient space in the destination buffer. 1053 * 1054 * @param ut the UText from which to extract data. 1055 * @param nativeStart the native index of the first characer to extract. 1056 * @param nativeLimit the native string index of the position following the last 1057 * character to extract. 1058 * @param dest the UChar (UTF-16) buffer into which the extracted text is placed 1059 * @param destCapacity The size, in UChars, of the destination buffer. May be zero 1060 * for precomputing the required size. 1061 * @param status receives any error status. 1062 * If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for 1063 * preflighting. 1064 * @return Number of UChars in the data. Does not include a trailing NUL. 1065 * 1066 * @stable ICU 3.4 1067 */ 1068typedef int32_t U_CALLCONV 1069UTextExtract(UText *ut, 1070 int64_t nativeStart, int64_t nativeLimit, 1071 UChar *dest, int32_t destCapacity, 1072 UErrorCode *status); 1073 1074/** 1075 * Function type declaration for UText.replace(). 1076 * 1077 * Replace a range of the original text with a replacement text. 1078 * 1079 * Leaves the current iteration position at the position following the 1080 * newly inserted replacement text. 1081 * 1082 * This function need only be implemented on UText types that support writing. 1083 * 1084 * When using this function, there should be only a single UText opened onto the 1085 * underlying native text string. The function is responsible for updating the 1086 * text chunk within the UText to reflect the updated iteration position, 1087 * taking into account any changes to the underlying string's structure caused 1088 * by the replace operation. 1089 * 1090 * @param ut the UText representing the text to be operated on. 
1091 * @param nativeStart the index of the start of the region to be replaced 1092 * @param nativeLimit the index of the character following the region to be replaced. 1093 * @param replacementText pointer to the replacement text 1094 * @param replacmentLength length of the replacement text in UChars, or -1 if the text is NUL terminated. 1095 * @param status receives any error status. Possible errors include 1096 * U_NO_WRITE_PERMISSION 1097 * 1098 * @return The signed number of (native) storage units by which 1099 * the length of the text expanded or contracted. 1100 * 1101 * @stable ICU 3.4 1102 */ 1103typedef int32_t U_CALLCONV 1104UTextReplace(UText *ut, 1105 int64_t nativeStart, int64_t nativeLimit, 1106 const UChar *replacementText, int32_t replacmentLength, 1107 UErrorCode *status); 1108 1109/** 1110 * Function type declaration for UText.copy(). 1111 * 1112 * Copy or move a substring from one position to another within the text, 1113 * while retaining any metadata associated with the text. 1114 * This function is used to duplicate or reorder substrings. 1115 * The destination index must not overlap the source range. 1116 * 1117 * The text to be copied or moved is inserted at destIndex; 1118 * it does not replace or overwrite any existing text. 1119 * 1120 * This function need only be implemented for UText types that support writing. 1121 * 1122 * When using this function, there should be only a single UText opened onto the 1123 * underlying native text string. The function is responsible for updating the 1124 * text chunk within the UText to reflect the updated iteration position, 1125 * taking into account any changes to the underlying string's structure caused 1126 * by the replace operation. 1127 * 1128 * @param ut The UText representing the text to be operated on. 1129 * @param nativeStart The index of the start of the region to be copied or moved 1130 * @param nativeLimit The index of the character following the region to be replaced. 
1131 * @param nativeDest The destination index to which the source substring is copied or moved. 1132 * @param move If TRUE, then the substring is moved, not copied/duplicated. 1133 * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION 1134 * 1135 * @stable ICU 3.4 1136 */ 1137typedef void U_CALLCONV 1138UTextCopy(UText *ut, 1139 int64_t nativeStart, int64_t nativeLimit, 1140 int64_t nativeDest, 1141 UBool move, 1142 UErrorCode *status); 1143 1144/** 1145 * Function type declaration for UText.mapOffsetToNative(). 1146 * Map from the current UChar offset within the current text chunk to 1147 * the corresponding native index in the original source text. 1148 * 1149 * This is required only for text providers that do not use native UTF-16 indexes. 1150 * 1151 * @param ut the UText. 1152 * @return Absolute (native) index corresponding to chunkOffset in the current chunk. 1153 * The returned native index should always be to a code point boundary. 1154 * 1155 * @stable ICU 3.4 1156 */ 1157typedef int64_t U_CALLCONV 1158UTextMapOffsetToNative(const UText *ut); 1159 1160/** 1161 * Function type declaration for UText.mapIndexToUTF16(). 1162 * Map from a native index to a UChar offset within a text chunk. 1163 * Behavior is undefined if the native index does not fall within the 1164 * current chunk. 1165 * 1166 * This function is required only for text providers that do not use native UTF-16 indexes. 1167 * 1168 * @param ut The UText containing the text chunk. 1169 * @param nativeIndex Absolute (native) text index, chunk->start<=index<=chunk->limit. 1170 * @return Chunk-relative UTF-16 offset corresponding to the specified native 1171 * index. 1172 * 1173 * @stable ICU 3.4 1174 */ 1175typedef int32_t U_CALLCONV 1176UTextMapNativeIndexToUTF16(const UText *ut, int64_t nativeIndex); 1177 1178 1179/** 1180 * Function type declaration for UText.utextClose(). 
1181 * 1182 * A Text Provider close function is only required for provider types that make 1183 * allocations in their open function (or other functions) that must be 1184 * cleaned when the UText is closed. 1185 * 1186 * The allocation of the UText struct itself and any "extra" storage 1187 * associated with the UText is handled by the common UText implementation 1188 * and does not require provider specific cleanup in a close function. 1189 * 1190 * Most UText provider implementations do not need to implement this function. 1191 * 1192 * @param ut A UText object to be closed. 1193 * 1194 * @stable ICU 3.4 1195 */ 1196typedef void U_CALLCONV 1197UTextClose(UText *ut); 1198 1199 1200/** 1201 * (public) Function dispatch table for UText. 1202 * Conceptually very much like a C++ Virtual Function Table. 1203 * This struct defines the organization of the table. 1204 * Each text provider implementation must provide an 1205 * actual table that is initialized with the appropriate functions 1206 * for the type of text being handled. 1207 * @stable ICU 3.6 1208 */ 1209struct UTextFuncs { 1210 /** 1211 * (public) Function table size, sizeof(UTextFuncs) 1212 * Intended for use should the table grow to accomodate added 1213 * functions in the future, to allow tests for older format 1214 * function tables that do not contain the extensions. 1215 * 1216 * Fields are placed for optimal alignment on 1217 * 32/64/128-bit-pointer machines, by normally grouping together 1218 * 4 32-bit fields, 1219 * 4 pointers, 1220 * 2 64-bit fields 1221 * in sequence. 1222 * @stable ICU 3.6 1223 */ 1224 int32_t tableSize; 1225 1226 /** 1227 * (private) Alignment padding. 1228 * Do not use, reserved for use by the UText framework only. 
1229 * @internal 1230 */ 1231 int32_t reserved1, /** @internal */ reserved2, /** @internal */ reserved3; 1232 1233 1234 /** 1235 * (public) Function pointer for UTextClone 1236 * 1237 * @see UTextClone 1238 * @stable ICU 3.6 1239 */ 1240 UTextClone *clone; 1241 1242 /** 1243 * (public) function pointer for UTextLength 1244 * May be expensive to compute! 1245 * 1246 * @see UTextLength 1247 * @stable ICU 3.6 1248 */ 1249 UTextNativeLength *nativeLength; 1250 1251 /** 1252 * (public) Function pointer for UTextAccess. 1253 * 1254 * @see UTextAccess 1255 * @stable ICU 3.6 1256 */ 1257 UTextAccess *access; 1258 1259 /** 1260 * (public) Function pointer for UTextExtract. 1261 * 1262 * @see UTextExtract 1263 * @stable ICU 3.6 1264 */ 1265 UTextExtract *extract; 1266 1267 /** 1268 * (public) Function pointer for UTextReplace. 1269 * 1270 * @see UTextReplace 1271 * @stable ICU 3.6 1272 */ 1273 UTextReplace *replace; 1274 1275 /** 1276 * (public) Function pointer for UTextCopy. 1277 * 1278 * @see UTextCopy 1279 * @stable ICU 3.6 1280 */ 1281 UTextCopy *copy; 1282 1283 /** 1284 * (public) Function pointer for UTextMapOffsetToNative. 1285 * 1286 * @see UTextMapOffsetToNative 1287 * @stable ICU 3.6 1288 */ 1289 UTextMapOffsetToNative *mapOffsetToNative; 1290 1291 /** 1292 * (public) Function pointer for UTextMapNativeIndexToUTF16. 1293 * 1294 * @see UTextMapNativeIndexToUTF16 1295 * @stable ICU 3.6 1296 */ 1297 UTextMapNativeIndexToUTF16 *mapNativeIndexToUTF16; 1298 1299 /** 1300 * (public) Function pointer for UTextClose. 
1301 * 1302 * @see UTextClose 1303 * @stable ICU 3.6 1304 */ 1305 UTextClose *close; 1306 1307 /** 1308 * (private) Spare function pointer 1309 * @internal 1310 */ 1311 UTextClose *spare1; 1312 1313 /** 1314 * (private) Spare function pointer 1315 * @internal 1316 */ 1317 UTextClose *spare2; 1318 1319 /** 1320 * (private) Spare function pointer 1321 * @internal 1322 */ 1323 UTextClose *spare3; 1324 1325}; 1326/** 1327 * Function dispatch table for UText 1328 * @see UTextFuncs 1329 */ 1330typedef struct UTextFuncs UTextFuncs; 1331 1332 /** 1333 * UText struct. Provides the interface between the generic UText access code 1334 * and the UText provider code that works on specific kinds of 1335 * text (UTF-8, noncontiguous UTF-16, whatever.) 1336 * 1337 * Applications that are using predefined types of text providers 1338 * to pass text data to ICU services will have no need to view the 1339 * internals of the UText structs that they open. 1340 * 1341 * @stable ICU 3.6 1342 */ 1343struct UText { 1344 /** 1345 * (private) Magic. Used to help detect when UText functions are handed 1346 * invalid or unitialized UText structs. 1347 * utext_openXYZ() functions take an initialized, 1348 * but not necessarily open, UText struct as an 1349 * optional fill-in parameter. This magic field 1350 * is used to check for that initialization. 1351 * Text provider close functions must NOT clear 1352 * the magic field because that would prevent 1353 * reuse of the UText struct. 1354 * @internal 1355 */ 1356 uint32_t magic; 1357 1358 1359 /** 1360 * (private) Flags for managing the allocation and freeing of 1361 * memory associated with this UText. 1362 * @internal 1363 */ 1364 int32_t flags; 1365 1366 1367 /** 1368 * Text provider properties. This set of flags is maintainted by the 1369 * text provider implementation. 
1370 * @stable ICU 3.4 1371 */ 1372 int32_t providerProperties; 1373 1374 /** 1375 * (public) sizeOfStruct=sizeof(UText) 1376 * Allows possible backward compatible extension. 1377 * 1378 * @stable ICU 3.4 1379 */ 1380 int32_t sizeOfStruct; 1381 1382 /* ------ 16 byte alignment boundary ----------- */ 1383 1384 1385 /** 1386 * (protected) Native index of the first character position following 1387 * the current chunk. 1388 * @stable ICU 3.6 1389 */ 1390 int64_t chunkNativeLimit; 1391 1392 /** 1393 * (protected) Size in bytes of the extra space (pExtra). 1394 * @stable ICU 3.4 1395 */ 1396 int32_t extraSize; 1397 1398 /** 1399 * (protected) The highest chunk offset where native indexing and 1400 * chunk (UTF-16) indexing correspond. For UTF-16 sources, value 1401 * will be equal to chunkLength. 1402 * 1403 * @stable ICU 3.6 1404 */ 1405 int32_t nativeIndexingLimit; 1406 1407 /* ---- 16 byte alignment boundary------ */ 1408 1409 /** 1410 * (protected) Native index of the first character in the text chunk. 1411 * @stable ICU 3.6 1412 */ 1413 int64_t chunkNativeStart; 1414 1415 /** 1416 * (protected) Current iteration position within the text chunk (UTF-16 buffer). 1417 * This is the index to the character that will be returned by utext_next32(). 1418 * @stable ICU 3.6 1419 */ 1420 int32_t chunkOffset; 1421 1422 /** 1423 * (protected) Length the text chunk (UTF-16 buffer), in UChars. 1424 * @stable ICU 3.6 1425 */ 1426 int32_t chunkLength; 1427 1428 /* ---- 16 byte alignment boundary-- */ 1429 1430 1431 /** 1432 * (protected) pointer to a chunk of text in UTF-16 format. 1433 * May refer either to original storage of the source of the text, or 1434 * if conversion was required, to a buffer owned by the UText. 1435 * @stable ICU 3.6 1436 */ 1437 const UChar *chunkContents; 1438 1439 /** 1440 * (public) Pointer to Dispatch table for accessing functions for this UText. 
1441 * @stable ICU 3.6 1442 */ 1443 const UTextFuncs *pFuncs; 1444 1445 /** 1446 * (protected) Pointer to additional space requested by the 1447 * text provider during the utext_open operation. 1448 * @stable ICU 3.4 1449 */ 1450 void *pExtra; 1451 1452 /** 1453 * (protected) Pointer to string or text-containin object or similar. 1454 * This is the source of the text that this UText is wrapping, in a format 1455 * that is known to the text provider functions. 1456 * @stable ICU 3.4 1457 */ 1458 const void *context; 1459 1460 /* --- 16 byte alignment boundary--- */ 1461 1462 /** 1463 * (protected) Pointer fields available for use by the text provider. 1464 * Not used by UText common code. 1465 * @stable ICU 3.6 1466 */ 1467 const void *p; 1468 /** 1469 * (protected) Pointer fields available for use by the text provider. 1470 * Not used by UText common code. 1471 * @stable ICU 3.6 1472 */ 1473 const void *q; 1474 /** 1475 * (protected) Pointer fields available for use by the text provider. 1476 * Not used by UText common code. 1477 * @stable ICU 3.6 1478 */ 1479 const void *r; 1480 1481 /** 1482 * Private field reserved for future use by the UText framework 1483 * itself. This is not to be touched by the text providers. 1484 * @internal ICU 3.4 1485 */ 1486 void *privP; 1487 1488 1489 /* --- 16 byte alignment boundary--- */ 1490 1491 1492 /** 1493 * (protected) Integer field reserved for use by the text provider. 1494 * Not used by the UText framework, or by the client (user) of the UText. 1495 * @stable ICU 3.4 1496 */ 1497 int64_t a; 1498 1499 /** 1500 * (protected) Integer field reserved for use by the text provider. 1501 * Not used by the UText framework, or by the client (user) of the UText. 1502 * @stable ICU 3.4 1503 */ 1504 int32_t b; 1505 1506 /** 1507 * (protected) Integer field reserved for use by the text provider. 1508 * Not used by the UText framework, or by the client (user) of the UText. 
1509 * @stable ICU 3.4 1510 */ 1511 int32_t c; 1512 1513 /* ---- 16 byte alignment boundary---- */ 1514 1515 1516 /** 1517 * Private field reserved for future use by the UText framework 1518 * itself. This is not to be touched by the text providers. 1519 * @internal ICU 3.4 1520 */ 1521 int64_t privA; 1522 /** 1523 * Private field reserved for future use by the UText framework 1524 * itself. This is not to be touched by the text providers. 1525 * @internal ICU 3.4 1526 */ 1527 int32_t privB; 1528 /** 1529 * Private field reserved for future use by the UText framework 1530 * itself. This is not to be touched by the text providers. 1531 * @internal ICU 3.4 1532 */ 1533 int32_t privC; 1534}; 1535 1536 1537/** 1538 * Common function for use by Text Provider implementations to allocate and/or initialize 1539 * a new UText struct. To be called in the implementation of utext_open() functions. 1540 * If the supplied UText parameter is null, a new UText struct will be allocated on the heap. 1541 * If the supplied UText is already open, the provider's close function will be called 1542 * so that the struct can be reused by the open that is in progress. 1543 * 1544 * @param ut pointer to a UText struct to be re-used, or null if a new UText 1545 * should be allocated. 1546 * @param extraSpace The amount of additional space to be allocated as part 1547 * of this UText, for use by types of providers that require 1548 * additional storage. 1549 * @param status Errors are returned here. 1550 * @return pointer to the UText, allocated if necessary, with extra space set up if requested. 1551 * @stable ICU 3.4 1552 */ 1553U_STABLE UText * U_EXPORT2 1554utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status); 1555 1556#ifndef U_HIDE_INTERNAL_API 1557/** 1558 * @internal 1559 * Value used to help identify correctly initialized UText structs. 1560 * Note: must be publicly visible so that UTEXT_INITIALIZER can access it. 
1561 */ 1562enum { 1563 UTEXT_MAGIC = 0x345ad82c 1564}; 1565#endif /* U_HIDE_INTERNAL_API */ 1566 1567/** 1568 * initializer to be used with local (stack) instances of a UText 1569 * struct. UText structs must be initialized before passing 1570 * them to one of the utext_open functions. 1571 * 1572 * @stable ICU 3.6 1573 */ 1574#define UTEXT_INITIALIZER { \ 1575 UTEXT_MAGIC, /* magic */ \ 1576 0, /* flags */ \ 1577 0, /* providerProps */ \ 1578 sizeof(UText), /* sizeOfStruct */ \ 1579 0, /* chunkNativeLimit */ \ 1580 0, /* extraSize */ \ 1581 0, /* nativeIndexingLimit */ \ 1582 0, /* chunkNativeStart */ \ 1583 0, /* chunkOffset */ \ 1584 0, /* chunkLength */ \ 1585 NULL, /* chunkContents */ \ 1586 NULL, /* pFuncs */ \ 1587 NULL, /* pExtra */ \ 1588 NULL, /* context */ \ 1589 NULL, NULL, NULL, /* p, q, r */ \ 1590 NULL, /* privP */ \ 1591 0, 0, 0, /* a, b, c */ \ 1592 0, 0, 0 /* privA,B,C, */ \ 1593 } 1594 1595 1596U_CDECL_END 1597 1598 1599 1600#endif 1601