15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)******************************************************************************
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*   Copyright (C) 1999-2006, International Business Machines
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*   Corporation and others.  All Rights Reserved.
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)******************************************************************************
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*   file name:  utf_impl.c
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*   encoding:   US-ASCII
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*   tab size:   8 (not used)
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*   indentation:4
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*   created on: 1999sep13
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*   created by: Markus W. Scherer
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*   This file provides implementation functions for macros in the utfXX.h
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*   that would otherwise be too long as macros.
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)*/
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/third_party/icu/icu_utf.h"
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace base_icu {
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/**
 * CBUTF8_ERROR_VALUE_1 and CBUTF8_ERROR_VALUE_2 are special error values for
 * UTF-8, which need 1 or 2 bytes in UTF-8:
 * \code
 * U+0015 = NAK = Negative Acknowledge, C0 control character
 * U+009f = highest C1 control character
 * \endcode
 *
 * These are used by UTF8_..._SAFE macros so that they can return an error value
 * that needs the same number of code units (bytes) as were seen by
 * a macro. They should be tested with UTF_IS_ERROR() or UTF_IS_VALID().
 *
 * In this file, CBUTF8_ERROR_VALUE_1 is the first entry of utf8_errorValue[]
 * and is what utf8_nextCharSafeBody() returns for a lead byte that has no
 * trail bytes when strict>=0.
 *
 * @deprecated ICU 2.4. Obsolete, see utf_old.h.
 */
#define CBUTF8_ERROR_VALUE_1 0x15
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/**
 * Two-byte error value (U+009f); the second entry of utf8_errorValue[].
 * See documentation on CBUTF8_ERROR_VALUE_1 for details.
 *
 * @deprecated ICU 2.4. Obsolete, see utf_old.h.
 */
#define CBUTF8_ERROR_VALUE_2 0x9f
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
/**
 * Error value for all UTFs. This code point value will be set by macros with
 * error checking if an error is detected.
 *
 * @deprecated ICU 2.4. Obsolete, see utf_old.h.
 */
#define CBUTF_ERROR_VALUE 0xffff
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * This table could be replaced on many machines by
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * a few lines of assembler code using an
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * "index of first 0-bit from msb" instruction and
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * one or two more integer instructions.
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * For example, on an i386, do something like
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * - MOV AL, leadByte
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * - NOT AL         (8-bit, leave b15..b8==0..0, reverse only b7..b0)
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * - MOV AH, 0
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * - BSR BX, AX     (16-bit)
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * - MOV AX, 6      (result)
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * - JZ finish      (ZF==1 if leadByte==0xff)
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * - SUB AX, BX (result)
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * -finish:
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * (BSR: Bit Scan Reverse, scans for a 1-bit, starting from the MSB)
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * In Unicode, all UTF-8 byte sequences with more than 4 bytes are illegal;
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * lead bytes above 0xf4 are illegal.
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * We keep them in this table for skipping long ISO 10646-UTF-8 sequences.
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const uint8
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)utf8_countTrailBytes[256]={
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    3, 3, 3, 3, 3,
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    3, 3, 3,    /* illegal in Unicode */
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    4, 4, 4, 4, /* illegal in Unicode */
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    5, 5,       /* illegal in Unicode */
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0, 0        /* illegal bytes 0xfe and 0xff */
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const UChar32
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const UChar32
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)utf8_errorValue[6]={
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    CBUTF8_ERROR_VALUE_1, CBUTF8_ERROR_VALUE_2, CBUTF_ERROR_VALUE, 0x10ffff,
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    0x3ffffff, 0x7fffffff
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)/*
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * Handle the non-inline part of the U8_NEXT() macro and its obsolete sibling
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * UTF8_NEXT_CHAR_SAFE().
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * The "strict" parameter controls the error behavior:
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * <0  "Safe" behavior of U8_NEXT(): All illegal byte sequences yield a negative
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *     code point result.
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *  0  Obsolete "safe" behavior of UTF8_NEXT_CHAR_SAFE(..., FALSE):
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *     All illegal byte sequences yield a positive code point such that this
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *     result code point would be encoded with the same number of bytes as
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *     the illegal sequence.
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * >0  Obsolete "strict" behavior of UTF8_NEXT_CHAR_SAFE(..., TRUE):
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *     Same as the obsolete "safe" behavior, but non-characters are also treated
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *     like illegal sequences.
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * The special negative (<0) value -2 is used for lenient treatment of surrogate
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * code points as legal. Some implementations use this for roundtripping of
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * Unicode 16-bit strings that are not well-formed UTF-16, that is, they
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * contain unpaired surrogates.
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) * Note that a UBool is the same as an int8_t.
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) */
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)UChar32
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c, UBool strict) {
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int32 i=*pi;
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    uint8 count=CBU8_COUNT_TRAIL_BYTES(c);
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if((i)+count<=(length)) {
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        uint8 trail, illegal=0;
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        CBU8_MASK_LEAD_BYTE((c), count);
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        switch(count) {
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        /* each branch falls through to the next one */
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        case 5:
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        case 4:
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            illegal=1;
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            break;
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        case 3:
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            trail=s[(i)++];
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            (c)=((c)<<6)|(trail&0x3f);
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            if(c<0x110) {
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                illegal|=(trail&0xc0)^0x80;
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            } else {
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                /* code point>0x10ffff, outside Unicode */
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                illegal=1;
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                break;
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            }
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        case 2:
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            trail=s[(i)++];
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            (c)=((c)<<6)|(trail&0x3f);
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            illegal|=(trail&0xc0)^0x80;
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        case 1:
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            trail=s[(i)++];
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            (c)=((c)<<6)|(trail&0x3f);
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            illegal|=(trail&0xc0)^0x80;
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            break;
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        case 0:
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            if(strict>=0) {
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                return CBUTF8_ERROR_VALUE_1;
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            } else {
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                return CBU_SENTINEL;
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            }
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        /* no default branch to optimize switch()  - all values are covered */
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        /*
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)         * All the error handling should return a value
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)         * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)         *
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)         * Starting with Unicode 3.0.1, non-shortest forms are illegal.
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)         * Starting with Unicode 3.2, surrogate code points must not be
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)         * encoded in UTF-8, and there are no irregular sequences any more.
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)         *
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)         * U8_ macros (new in ICU 2.4) return negative values for error conditions.
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)         */
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        /* correct sequence - all trail bytes have (b7..b6)==(10)? */
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        /* illegal is also set if count>=4 */
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if(illegal || (c)<utf8_minLegal[count] || (CBU_IS_SURROGATE(c) && strict!=-2)) {
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            /* error handling */
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            uint8 errorCount=count;
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            /* don't go beyond this sequence */
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            i=*pi;
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            while(count>0 && CBU8_IS_TRAIL(s[i])) {
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                ++(i);
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                --count;
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            }
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            if(strict>=0) {
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                c=utf8_errorValue[errorCount-count];
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            } else {
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                c=CBU_SENTINEL;
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            }
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        } else if((strict)>0 && CBU_IS_UNICODE_NONCHAR(c)) {
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            /* strict: forbid non-characters like U+fffe */
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            c=utf8_errorValue[count];
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    } else /* too few bytes left */ {
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        /* error handling */
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        int32 i0=i;
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        /* don't just set (i)=(length) in case there is an illegal sequence */
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        while((i)<(length) && CBU8_IS_TRAIL(s[i])) {
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            ++(i);
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if(strict>=0) {
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            c=utf8_errorValue[i-i0];
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        } else {
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            c=CBU_SENTINEL;
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    *pi=i;
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return c;
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace base_icu
229