1f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* 2f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)******************************************************************************* 3f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* 4f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Copyright (C) 1999-2009, International Business Machines 5f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* Corporation and others. All Rights Reserved. 6f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* 7f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)******************************************************************************* 8f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* file name: utf8.h 9f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* encoding: US-ASCII 10f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* tab size: 8 (not used) 11f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* indentation:4 12f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* 13f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* created on: 1999sep13 14f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)* created by: Markus W. Scherer 15f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)*/ 16f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 17f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 18f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * \file 19f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * \brief C API: 8-bit Unicode handling macros 20f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 21f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings. 
22f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * utf8.h is included by utf.h after unicode/umachine.h 23f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * and some common definitions. 24f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 25f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * For more information see utf.h and the ICU User Guide Strings chapter 26f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * (http://icu-project.org/userguide/strings.html). 27f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 28f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * <em>Usage:</em> 29f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * ICU coding guidelines for if() statements should be followed when using these macros. 30f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Compound statements (curly braces {}) must be used for if-else-while... 31f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * bodies and all macro statements should be terminated with semicolon. 32f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 33f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 34f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#ifndef __UTF8_H__ 35f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define __UTF8_H__ 36f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 37f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* utf.h must be included first. 
*/ 38f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#ifndef __UTF_H__ 39f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)# include "unicode/utf.h" 40f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#endif 41f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 42f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* internal definitions ----------------------------------------------------- */ 43f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 44f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 45f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * \var utf8_countTrailBytes 46f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Internal array with numbers of trail bytes for any given byte used in 47f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * lead byte position. 48f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 49f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This is internal since it is not meant to be called directly by external clients; 50f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * however it is called by public macros in this file and thus must remain stable, 51f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * and should not be hidden when other internal functions are hidden (otherwise 52f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * public macros would fail to compile). 
53f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @internal 54f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 55f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#ifdef U_UTF8_IMPL 56f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_EXPORT const uint8_t 57f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#elif defined(U_STATIC_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) 58f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CFUNC const uint8_t 59f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#else 60f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CFUNC U_IMPORT const uint8_t /* U_IMPORT2? */ /*U_IMPORT*/ 61f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#endif 62f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)utf8_countTrailBytes[256]; 63f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 64f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 65f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Count the trail bytes for a UTF-8 lead byte. 66f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 67f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This is internal since it is not meant to be called directly by external clients; 68f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * however it is called by public macros in this file and thus must remain stable. 
69f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @internal 70f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 71f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte]) 72f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 73f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 74f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. 75f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 76f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This is internal since it is not meant to be called directly by external clients; 77f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * however it is called by public macros in this file and thus must remain stable. 78f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @internal 79f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 80f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) 81f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 82f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 83f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Function for handling "next code point" with error-checking. 
84f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 85f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This is internal since it is not meant to be called directly by external clients; 86f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this 87f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * file and thus must remain stable, and should not be hidden when other internal 88f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * functions are hidden (otherwise public macros would fail to compile). 89f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @internal 90f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 91f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_STABLE UChar32 U_EXPORT2 92f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict); 93f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 94f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 95f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Function for handling "append code point" with error-checking. 96f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 97f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This is internal since it is not meant to be called directly by external clients; 98f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this 99f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * file and thus must remain stable, and should not be hidden when other internal 100f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * functions are hidden (otherwise public macros would fail to compile). 
101f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @internal 102f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 103f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_STABLE int32_t U_EXPORT2 104f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError); 105f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 106f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 107f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Function for handling "previous code point" with error-checking. 108f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 109f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This is internal since it is not meant to be called directly by external clients; 110f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this 111f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * file and thus must remain stable, and should not be hidden when other internal 112f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * functions are hidden (otherwise public macros would fail to compile). 113f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @internal 114f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 115f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_STABLE UChar32 U_EXPORT2 116f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict); 117f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 118f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 119f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Function for handling "skip backward one code point" with error-checking. 
120f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 121f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * This is internal since it is not meant to be called directly by external clients; 122f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this 123f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * file and thus must remain stable, and should not be hidden when other internal 124f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * functions are hidden (otherwise public macros would fail to compile). 125f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @internal 126f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 127f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_STABLE int32_t U_EXPORT2 128f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); 129f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 130f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* single-code point definitions -------------------------------------------- */ 131f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 132f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 133f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? 
134f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param c 8-bit code unit (byte) 135f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @return TRUE or FALSE 136f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @stable ICU 2.4 137f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 138f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define U8_IS_SINGLE(c) (((c)&0x80)==0) 139f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 140f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 141f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Is this code unit (byte) a UTF-8 lead byte? 142f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param c 8-bit code unit (byte) 143f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @return TRUE or FALSE 144f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @stable ICU 2.4 145f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 146f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e) 147f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 148f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 149f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Is this code unit (byte) a UTF-8 trail byte? 
150f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param c 8-bit code unit (byte) 151f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @return TRUE or FALSE 152f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @stable ICU 2.4 153f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 154f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80) 155f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 156f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 157f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * How many code units (bytes) are used for the UTF-8 encoding 158f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * of this Unicode code point? 159f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param c 32-bit code point 160f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @return 1..4, or 0 if c is a surrogate or not a Unicode code point 161f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @stable ICU 2.4 162f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 163f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define U8_LENGTH(c) \ 164f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ((uint32_t)(c)<=0x7f ? 1 : \ 165f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ((uint32_t)(c)<=0x7ff ? 2 : \ 166f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ((uint32_t)(c)<=0xd7ff ? 3 : \ 167f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ 168f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ((uint32_t)(c)<=0xffff ? 
3 : 4)\ 169f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) \ 170f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) \ 171f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) \ 172f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) 173f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 174f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 175f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). 176f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @return 4 177f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @stable ICU 2.4 178f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 179f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define U8_MAX_LENGTH 4 180f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 181f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 182f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Get a code point from a string at a random-access offset, 183f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * without changing the offset. 184f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * The offset may point to either the lead byte or one of the trail bytes 185f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * for a code point, in which case the macro will read all of the bytes 186f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * for the code point. 187f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * The result is undefined if the offset points to an illegal UTF-8 188f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * byte sequence. 189f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. 
190f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 191f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param s const uint8_t * string 192f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param i string offset 193f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param c output UChar32 variable 194f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @see U8_GET 195f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @stable ICU 2.4 196f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 197f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define U8_GET_UNSAFE(s, i, c) { \ 198f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t _u8_get_unsafe_index=(int32_t)(i); \ 199f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \ 200f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \ 201f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 202f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 203f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 204f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Get a code point from a string at a random-access offset, 205f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * without changing the offset. 206f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * The offset may point to either the lead byte or one of the trail bytes 207f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * for a code point, in which case the macro will read all of the bytes 208f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * for the code point. 
209f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * If the offset points to an illegal UTF-8 byte sequence, then 210f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * c is set to a negative value. 211f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. 212f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 213f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param s const uint8_t * string 214f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param start starting string offset 215f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param i string offset, must be start<=i<length 216f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param length string length 217f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param c output UChar32 variable, set to <0 in case of an error 218f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @see U8_GET_UNSAFE 219f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @stable ICU 2.4 220f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 221f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define U8_GET(s, start, i, length, c) { \ 222f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) int32_t _u8_get_index=(int32_t)(i); \ 223f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U8_SET_CP_START(s, start, _u8_get_index); \ 224f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U8_NEXT(s, _u8_get_index, length, c); \ 225f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 226f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 227f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/* definitions with forward iteration --------------------------------------- */ 228f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 
229f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 230f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Get a code point from a string at a code point boundary offset, 231f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * and advance the offset to the next code point boundary. 232f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * (Post-incrementing forward iteration.) 233f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * "Unsafe" macro, assumes well-formed UTF-8. 234f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 235f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * The offset may point to the lead byte of a multi-byte sequence, 236f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * in which case the macro will read the whole sequence. 237f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * The result is undefined if the offset points to a trail byte 238f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * or an illegal UTF-8 sequence. 
239f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 240f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param s const uint8_t * string 241f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param i string offset 242f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param c output UChar32 variable 243f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @see U8_NEXT 244f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @stable ICU 2.4 245f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 246f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define U8_NEXT_UNSAFE(s, i, c) { \ 247f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (c)=(uint8_t)(s)[(i)++]; \ 248f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if((uint8_t)((c)-0xc0)<0x35) { \ 249f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint8_t __count=U8_COUNT_TRAIL_BYTES(c); \ 250f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U8_MASK_LEAD_BYTE(c, __count); \ 251f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) switch(__count) { \ 252f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /* each following branch falls through to the next one */ \ 253f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case 3: \ 254f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (c)=((c)<<6)|((s)[(i)++]&0x3f); \ 255f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case 2: \ 256f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (c)=((c)<<6)|((s)[(i)++]&0x3f); \ 257f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) case 1: \ 258f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (c)=((c)<<6)|((s)[(i)++]&0x3f); \ 259f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /* no other branches to optimize switch() */ \ 260f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) break; 
\ 261f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } \ 262f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } \ 263f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 264f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 265f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 266f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Get a code point from a string at a code point boundary offset, 267f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * and advance the offset to the next code point boundary. 268f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * (Post-incrementing forward iteration.) 269f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * "Safe" macro, checks for illegal sequences and for string boundaries. 270f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 271f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * The offset may point to the lead byte of a multi-byte sequence, 272f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * in which case the macro will read the whole sequence. 273f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * If the offset points to a trail byte or an illegal UTF-8 sequence, then 274f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * c is set to a negative value. 
275f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 276f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param s const uint8_t * string 277f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param i string offset, must be i<length 278f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param length string length 279f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param c output UChar32 variable, set to <0 in case of an error 280f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @see U8_NEXT_UNSAFE 281f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @stable ICU 2.4 282f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 283f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define U8_NEXT(s, i, length, c) { \ 284f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (c)=(uint8_t)(s)[(i)++]; \ 285f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if((c)>=0x80) { \ 286f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) uint8_t __t1, __t2; \ 287f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if( /* handle U+1000..U+CFFF inline */ \ 288f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (0xe0<(c) && (c)<=0xec) && \ 289f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (((i)+1)<(length)) && \ 290f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ 291f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ 292f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) { \ 293f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ 294f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ 295f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne 
(Richard Coles) (i)+=2; \ 296f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else if( /* handle U+0080..U+07FF inline */ \ 297f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ((c)<0xe0 && (c)>=0xc2) && \ 298f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ((i)<(length)) && \ 299f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ 300f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ) { \ 301f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (c)=(UChar)((((c)&0x1f)<<6)|__t1); \ 302f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ++(i); \ 303f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else if(U8_IS_LEAD(c)) { \ 304f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) /* function call for "complicated" and error cases */ \ 305f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (int32_t)(length), c, -1); \ 306f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } else { \ 307f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) (c)=U_SENTINEL; \ 308f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } \ 309f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } \ 310f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 311f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 312f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/** 313f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Append a code point to a string, overwriting 1 to 4 bytes. 314f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * The offset points to the current end of the string contents 315f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * and is advanced (post-increment). 
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * The arguments are evaluated more than once, so they must not have
 * side effects.
 *
 * @param s uint8_t * string buffer (it is written to, so it must not be const)
 * @param i string offset
 * @param c code point to append
 * @see U8_APPEND
 * @stable ICU 2.4
 */
#define U8_APPEND_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0x7f) { \
        /* 1 byte: the code point itself */ \
        (s)[(i)++]=(uint8_t)(c); \
    } else { \
        /* write the lead byte (and, for 4 bytes, the first trail byte) here; */ \
        /* the remaining trail bytes are shared by the enclosing levels */ \
        if((uint32_t)(c)<=0x7ff) { \
            (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
        } else { \
            if((uint32_t)(c)<=0xffff) { \
                (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
            } else { \
                (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
                (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
            } \
            (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        } \
        /* last trail byte: lowest 6 bits */ \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } \
}

/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a non-ASCII code point is written, checks for sufficient space in the string.
 * If the code point is not valid or trail bytes do not fit,
 * then isError is set to TRUE.
 *
 * Note that the ASCII branch writes without looking at capacity;
 * this is why the caller must guarantee i<capacity.
 * The arguments are evaluated more than once, so they must not have
 * side effects.
 *
 * @param s uint8_t * string buffer (it is written to, so it must not be const)
 * @param i string offset, must be i<capacity
 * @param capacity size of the string buffer
 * @param c code point to append
 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
 * @see U8_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#define U8_APPEND(s, i, capacity, c, isError) { \
    if((uint32_t)(c)<=0x7f) { \
        (s)[(i)++]=(uint8_t)(c); \
    } else if((uint32_t)(c)<=0x7ff && (i)+1<(capacity)) { \
        /* 2 bytes, and both fit */ \
        (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else if((uint32_t)(c)<=0xd7ff && (i)+2<(capacity)) { \
        /* 3 bytes below U+D800, and all three fit */ \
        (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else { \
        /* everything else (c>=0xd800, or not enough room) goes to the function */ \
        (i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(capacity), c, &(isError)); \
    } \
}

/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset is advanced by one plus the trail byte count that
 * U8_COUNT_TRAIL_BYTES reports for the byte at the offset.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_FWD_1
 * @stable ICU 2.4
 */
#define U8_FWD_1_UNSAFE(s, i) { \
    (i)+=1+U8_COUNT_TRAIL_BYTES((s)[i]); \
}

/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The byte at the offset is always consumed. If it is a lead byte, then at
 * most as many following bytes are skipped as the lead byte announces,
 * stopping early at the end of the string or at the first byte that is
 * not a trail byte.
 *
 * @param s const uint8_t * string
 * @param i string offset, must be i<length
 * @param length string length
 * @see U8_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_1(s, i, length) { \
    uint8_t __b=(uint8_t)(s)[(i)++]; \
    if(U8_IS_LEAD(__b)) { \
        uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \
        /* never look beyond the end of the string */ \
        if((i)+__count>(length)) { \
            __count=(uint8_t)((length)-(i)); \
        } \
        while(__count>0 && U8_IS_TRAIL((s)[i])) { \
            ++(i); \
            --__count; \
        } \
    } \
}

/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * n is evaluated once and copied into a local counter;
 * nothing happens if n<=0.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_FWD_N
 * @stable ICU 2.4
 */
#define U8_FWD_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U8_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
}

/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * Iteration stops early when the offset reaches the string length.
 * n is evaluated once and copied into a local counter.
 *
 * @param s const uint8_t * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param n number of code points to skip
 * @see U8_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U8_FWD_N(s, i, length, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)<(length)) { \
        U8_FWD_1(s, i, length); \
        --__N; \
    } \
}

/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The loop has no lower bound: it keeps decrementing the offset for as long as
 * the byte there is a trail byte.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_START
 * @stable ICU 2.4
 */
#define U8_SET_CP_START_UNSAFE(s, i) { \
    while(U8_IS_TRAIL((s)[i])) { --(i); } \
}

/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to a UTF-8 trail byte,
 * then the offset is moved backward to the corresponding lead byte.
 * Otherwise, it is not modified.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The backward search itself is done by utf8_back1SafeBody(), which is only
 * called when the byte at the offset is a trail byte.
 *
 * @param s const uint8_t * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i
 * @see U8_SET_CP_START_UNSAFE
 * @stable ICU 2.4
 */
#define U8_SET_CP_START(s, start, i) { \
    if(U8_IS_TRAIL((s)[(i)])) { \
        (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \
    } \
}

/* definitions with backward iteration -------------------------------------- */

/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U8_PREV
 * @stable ICU 2.4
 */
#define U8_PREV_UNSAFE(s, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(U8_IS_TRAIL(c)) { \
        uint8_t __b, __count=1, __shift=6; \
\
        /* c is a trail byte */ \
        (c)&=0x3f; \
        for(;;) { \
            __b=(uint8_t)(s)[--(i)]; \
            if(__b>=0xc0) { \
                /* lead byte: mask off its length bits, add its payload, done */ \
                U8_MASK_LEAD_BYTE(__b, __count); \
                (c)|=(UChar32)__b<<__shift; \
                break; \
            } else { \
                /* another trail byte: 6 more payload bits */ \
                (c)|=(UChar32)(__b&0x3f)<<__shift; \
                ++__count; \
                __shift+=6; \
            } \
        } \
    } \
}

/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence (via utf8_prevCharSafeBody()).
 * If the offset is behind a byte above 0xbf, i.e., a byte that is not preceded
 * by its trail bytes, then c is set to U_SENTINEL
 * (unlike U8_PREV_UNSAFE, which returns the byte itself).
 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
 *
 * @param s const uint8_t * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U8_PREV(s, start, i, c) { \
    (c)=(uint8_t)(s)[--(i)]; \
    if((c)>=0x80) { \
        if((c)<=0xbf) { \
            /* trail byte: function call for multi-byte and error cases */ \
            (c)=utf8_prevCharSafeBody((const uint8_t *)(s), start, &(i), c, -1); \
        } else { \
            (c)=U_SENTINEL; \
        } \
    } \
}

/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset is decremented at least once, and then again for as long as
 * the byte at the new offset is a trail byte; there is no lower bound.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_BACK_1
 * @stable ICU 2.4
 */
#define U8_BACK_1_UNSAFE(s, i) { \
    while(U8_IS_TRAIL((s)[--(i)])) {} \
}

/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The offset is always decremented once. If the byte there is a trail byte,
 * then utf8_back1SafeBody() determines the final offset.
 *
 * @param s const uint8_t * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @see U8_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U8_BACK_1(s, start, i) { \
    if(U8_IS_TRAIL((s)[--(i)])) { \
        (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \
    } \
}

/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * n is evaluated once and copied into a local counter;
 * nothing happens if n<=0.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U8_BACK_N
 * @stable ICU 2.4
 */
#define U8_BACK_N_UNSAFE(s, i, n) { \
    int32_t __N=(n); \
    while(__N>0) { \
        U8_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
}

/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * Iteration stops early when the offset reaches start.
 * n is evaluated once and copied into a local counter.
 *
 * @param s const uint8_t * string
 * @param start index of the start of the string
 * @param i string offset, must be start<i
 * @param n number of code points to skip
 * @see U8_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U8_BACK_N(s, start, i, n) { \
    int32_t __N=(n); \
    while(__N>0 && (i)>(start)) { \
        U8_BACK_1(s, start, i); \
        --__N; \
    } \
}

/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * Implemented as one step back to the start of the preceding code point
 * followed by one step forward over all of it.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U8_SET_CP_LIMIT_UNSAFE(s, i) { \
    U8_BACK_1_UNSAFE(s, i); \
    U8_FWD_1_UNSAFE(s, i); \
}

668f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * 669f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param s const uint8_t * string 670f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param start starting string offset (usually 0) 671f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param i string offset, must be start<=i<=length 672f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @param length string length 673f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @see U8_SET_CP_LIMIT_UNSAFE 674f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * @stable ICU 2.4 675f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */ 676f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#define U8_SET_CP_LIMIT(s, start, i, length) { \ 677f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) if((start)<(i) && (i)<(length)) { \ 678f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U8_BACK_1(s, start, i); \ 679f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) U8_FWD_1(s, i, length); \ 680f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) } \ 681f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)} 682f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) 683f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#endif 684