1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru****************************************************************************** 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius* Copyright (C) 1999-2012, International Business Machines 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* Corporation and others. All Rights Reserved. 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru****************************************************************************** 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* file name: utf_impl.c 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* encoding: US-ASCII 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* tab size: 8 (not used) 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* indentation:4 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created on: 1999sep13 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* created by: Markus W. Scherer 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* This file provides implementation functions for macros in the utfXX.h 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru* that would otherwise be too long as macros. 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru*/ 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* set import/export definitions */ 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifndef U_UTF8_IMPL 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru# define U_UTF8_IMPL 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 26103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf.h" 27103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf8.h" 28103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "unicode/utf_old.h" 29103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius#include "uassert.h" 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * This table could be replaced on many machines by 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * a few lines of assembler code using an 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * "index of first 0-bit from msb" instruction and 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * one or two more integer instructions. 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * For example, on an i386, do something like 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - MOV AL, leadByte 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - NOT AL (8-bit, leave b15..b8==0..0, reverse only b7..b0) 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - MOV AH, 0 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - BSR BX, AX (16-bit) 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - MOV AX, 6 (result) 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - JZ finish (ZF==1 if leadByte==0xff) 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * - SUB AX, BX (result) 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * -finish: 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB) 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal; 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * lead bytes above 0xf4 are illegal. 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * We keep them in this table for skipping long ISO 10646-UTF-8 sequences. 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_EXPORT const uint8_t 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruutf8_countTrailBytes[256]={ 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 3, 3, 3, 3, 3, 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 3, 3, 3, /* illegal in Unicode */ 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 4, 4, 4, 4, /* illegal in Unicode */ 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 5, 5, /* illegal in Unicode */ 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0, 0 /* illegal bytes 0xfe and 0xff */ 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic const UChar32 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruutf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic const UChar32 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruutf8_errorValue[6]={ 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff, 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 0x3ffffff, 0x7fffffff 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 898393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Corneliusstatic UChar32 908393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig CorneliuserrorValue(int32_t count, int8_t strict) { 918393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius if(strict>=0) { 928393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius return utf8_errorValue[count]; 938393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } else if(strict==-3) { 948393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius return 0xfffd; 958393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } else { 968393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius return U_SENTINEL; 978393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 988393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius} 998393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 1018393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius * Handle the non-inline part of the U8_NEXT() and U8_NEXT_FFFD() macros 1028393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius * and their obsolete sibling UTF8_NEXT_CHAR_SAFE(). 1038393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius * 1048393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius * U8_NEXT() supports NUL-terminated strings indicated via length<0. 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * The "strict" parameter controls the error behavior: 1078393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius * <0 "Safe" behavior of U8_NEXT(): 1088393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius * -1: All illegal byte sequences yield U_SENTINEL=-1. 1098393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius * -2: Same as -1, except for lenient treatment of surrogate code points as legal. 1108393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius * Some implementations use this for roundtripping of 1118393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius * Unicode 16-bit strings that are not well-formed UTF-16, that is, they 1128393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius * contain unpaired surrogates. 1138393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius * -3: All illegal byte sequences yield U+FFFD. 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 0 Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE): 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * All illegal byte sequences yield a positive code point such that this 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * result code point would be encoded with the same number of bytes as 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * the illegal sequence. 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * >0 Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE): 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Same as the obsolete "safe" behavior, but non-characters are also treated 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * like illegal sequences. 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru * Note that a UBool is the same as an int8_t. 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI UChar32 U_EXPORT2 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruutf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) { 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=*pi; 127103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius uint8_t count=U8_COUNT_TRAIL_BYTES(c); 12854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */ 1298393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius if(i+count<=length || length<0) { 1308393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius uint8_t trail; 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1328393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius U8_MASK_LEAD_BYTE(c, count); 1338393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius /* support NUL-terminated strings: do not read beyond the first non-trail byte */ 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru switch(count) { 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* each branch falls through to the next one */ 1368393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius case 0: 1378393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru case 5: 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru case 4: 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru case 3: 1438393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius trail=s[i++]-0x80; 1448393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius c=(c<<6)|trail; 1458393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius /* c>=0x110 would result in code point>0x10ffff, outside Unicode */ 1468393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius if(c>=0x110 || trail>0x3f) { break; } 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru case 2: 1488393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius trail=s[i++]-0x80; 1498393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius c=(c<<6)|trail; 1508393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius /* 1518393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius * test for a surrogate d800..dfff unless we are lenient: 1528393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius * before the last (c<<6), a surrogate is c=360..37f 1538393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius */ 1548393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius if(((c&0xffe0)==0x360 && strict!=-2) || trail>0x3f) { break; } 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru case 1: 1568393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius trail=s[i++]-0x80; 1578393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius c=(c<<6)|trail; 1588393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius if(trail>0x3f) { break; } 1598393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius /* correct sequence - all trail bytes have (b7..b6)==(10) */ 1608393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius if(c>=utf8_minLegal[count] && 1618393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius /* strict: forbid non-characters like U+fffe */ 1628393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius (strict<=0 || !U_IS_UNICODE_NONCHAR(c))) { 1638393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius *pi=i; 1648393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius return c; 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* no default branch to optimize switch() - all values are covered */ 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1688393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } else { 1698393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius /* too few bytes left */ 1708393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius count=length-i; 1718393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius } 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 1738393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius /* error handling */ 1748393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius i=*pi; 1758393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius while(count>0 && U8_IS_TRAIL(s[i])) { 1768393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius ++i; 1778393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius --count; 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 1798393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius c=errorValue(i-*pi, strict); 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *pi=i; 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return c; 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruutf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) { 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((uint32_t)(c)<=0x7ff) { 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((i)+1<(length)) { 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return i; 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if((uint32_t)(c)<=0xffff) { 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */ 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((i)+2<(length) && !U_IS_SURROGATE(c)) { 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return i; 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if((uint32_t)(c)<=0x10ffff) { 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((i)+3<(length)) { 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); 204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); 206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return i; 207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* c>0x10ffff or not enough space, write an error value */ 210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(pIsError!=NULL) { 211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *pIsError=TRUE; 212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length-=i; 214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length>0) { 215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t offset; 216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(length>3) { 217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru length=3; 218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru s+=i; 220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offset=0; 221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c=utf8_errorValue[length-1]; 222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UTF8_APPEND_CHAR_UNSAFE(s, offset, c); 223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru i=i+offset; 224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return i; 227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI UChar32 U_EXPORT2 230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruutf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict) { 231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i=*pi; 232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t b, count=1, shift=6; 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 2348393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius if(!U8_IS_TRAIL(c)) { return errorValue(0, strict); } 2358393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* extract value bits from the last trail byte */ 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c&=0x3f; 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for(;;) { 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(i<=start) { 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* no lead byte at all */ 2428393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius return errorValue(0, strict); 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* read another previous byte */ 246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=s[--i]; 247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((uint8_t)(b-0x80)<0x7e) { /* 0x80<=b<0xfe */ 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(b&0x40) { 249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* lead byte, this will always end the loop */ 250103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius uint8_t shouldCount=U8_COUNT_TRAIL_BYTES(b); 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(count==shouldCount) { 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* set the new position */ 254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *pi=i; 255103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius U8_MASK_LEAD_BYTE(b, count); 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c|=(UChar32)b<<shift; 257103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2) || (strict>0 && U_IS_UNICODE_NONCHAR(c))) { 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* illegal sequence or (strict and non-character) */ 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(count>=4) { 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru count=3; 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 2628393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius c=errorValue(count, strict); 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* exit with correct c */ 265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* the lead byte does not match the number of trail bytes */ 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* only set the position to the lead byte if it would 269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru include the trail byte that we started with */ 270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(count<shouldCount) { 271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *pi=i; 2728393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius c=errorValue(count, strict); 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 2748393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius c=errorValue(0, strict); 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(count<5) { 279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* trail byte */ 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c|=(UChar32)(b&0x3f)<<shift; 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ++count; 282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru shift+=6; 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* more than 5 trail bytes is illegal */ 2858393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius c=errorValue(0, strict); 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* single-byte character precedes trailing bytes */ 2908393335b955da7340c9f19b1b4b2d6c0c2c04be7Craig Cornelius c=errorValue(0, strict); 291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return c; 295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_CAPI int32_t U_EXPORT2 298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruutf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i) { 299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* i had been decremented once before the function call */ 300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t I=i, Z; 301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint8_t b; 302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* read at most the 6 bytes s[Z] to s[i], inclusively */ 304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(I-5>start) { 305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru Z=I-5; 306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru Z=start; 308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* return I if the sequence starting there is long enough to include i */ 311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru b=s[I]; 313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if((uint8_t)(b-0x80)>=0x7e) { /* not 0x80<=b<0xfe */ 314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else if(b>=0xc0) { 316103e9ffba2cba345d0078eb8b8db33249f81840aCraig Cornelius if(U8_COUNT_TRAIL_BYTES(b)>=(i-I)) { 317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return I; 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } else { 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } while(Z<=--I); 323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru /* return i itself to be consistent with the FWD_1 macro */ 325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return i; 326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 327