/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
 *
 *                     The LLVM Compiler Infrastructure
 *
 * This file is distributed under the University of Illinois Open Source
 * License. See LICENSE.TXT for details.
 *
 *===------------------------------------------------------------------------=*/
/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */

/* ---------------------------------------------------------------------

    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
    Sept 2001: fixed const & error conditions per
        mods suggested by S. Parent & A. Lillich.
    June 2002: Tim Dodd added detection and handling of incomplete
        source sequences, enhanced error detection, added casts
        to eliminate compiler warnings.
    July 2003: slight mods to back out aggressive FFFE detection.
    Jan 2004: updated switches in from-UTF8 conversions.
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.

    See the header file "ConvertUTF.h" for complete documentation.
46f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 47f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)------------------------------------------------------------------------ */ 48f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 49f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 50f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include "llvm/Support/ConvertUTF.h" 51f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#ifdef CVTUTF_DEBUG 52f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include <stdio.h> 53f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#endif 54f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include <assert.h> 55f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 56f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static const int halfShift = 10; /* used for shifting by 10 bits */ 57f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 58f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static const UTF32 halfBase = 0x0010000UL; 59f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static const UTF32 halfMask = 0x3FFUL; 60f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 61f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#define UNI_SUR_HIGH_START (UTF32)0xD800 62f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#define UNI_SUR_HIGH_END (UTF32)0xDBFF 63f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#define UNI_SUR_LOW_START (UTF32)0xDC00 64f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#define UNI_SUR_LOW_END (UTF32)0xDFFF 65f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#define false 0 66f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#define true 1 67f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 68f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* 
--------------------------------------------------------------------- */ 69f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 70f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* 71f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Index into the table below with the first byte of a UTF-8 sequence to 72f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * get the number of trailing bytes that are supposed to follow it. 73f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is 74f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * left as-is for anyone who may want to do such conversion, which was 75f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * allowed in earlier algorithms. 76f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) */ 77f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static const char trailingBytesForUTF8[256] = { 78f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 79f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 80f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 81f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 82f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 83f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 84f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 85f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 86f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)}; 87f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 88f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* 89f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Magic values subtracted from a buffer value during UTF8 conversion. 90f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * This table contains as many values as there might be trailing bytes 91f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * in a UTF-8 sequence. 92f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) */ 93f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 94f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; 95f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 96f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* 97f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed 98f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * into the first byte, depending on how many bytes follow. There are 99f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * as many entries in this table as there are UTF-8 sequence types. 100f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * (I.e., one byte sequence, two byte... etc.). Remember that sequencs 101f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * for *legal* UTF-8 will be 4 or fewer bytes total. 
102f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) */ 103f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; 104f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 105f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */ 106f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 107f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* The interface converts a whole buffer to avoid function-call overhead. 108f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Constants have been gathered. Loops & conditionals have been removed as 109f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * much as possible for efficiency, in favor of drop-through switches. 110f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * (See "Note A" at the bottom of the file for equivalent code.) 111f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * If your compiler supports it, the "isLegalUTF8" call can be turned 112f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * into an inline function. 
113f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) */ 114f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 115f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 116f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */ 117f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 118f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)ConversionResult ConvertUTF32toUTF16 ( 119f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF32** sourceStart, const UTF32* sourceEnd, 120f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 121f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ConversionResult result = conversionOK; 122f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF32* source = *sourceStart; 123f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF16* target = *targetStart; 124f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) while (source < sourceEnd) { 125f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF32 ch; 126f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (target >= targetEnd) { 127f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) result = targetExhausted; break; 128f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 129f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ch = *source++; 130f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 131f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ 132f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 
133f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (flags == strictConversion) { 134f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) --source; /* return to the illegal value itself */ 135f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) result = sourceIllegal; 136f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) break; 137f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else { 138f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *target++ = UNI_REPLACEMENT_CHAR; 139f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 140f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else { 141f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *target++ = (UTF16)ch; /* normal case */ 142f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 143f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else if (ch > UNI_MAX_LEGAL_UTF32) { 144f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (flags == strictConversion) { 145f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) result = sourceIllegal; 146f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else { 147f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *target++ = UNI_REPLACEMENT_CHAR; 148f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 149f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else { 150f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* target is a character in range 0xFFFF - 0x10FFFF. */ 151f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (target + 1 >= targetEnd) { 152f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) --source; /* Back up source pointer! 
*/ 153f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) result = targetExhausted; break; 154f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 155f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ch -= halfBase; 156f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 157f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 158f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 159f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 160f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *sourceStart = source; 161f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *targetStart = target; 162f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) return result; 163f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)} 164f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 165f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */ 166f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 167f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)ConversionResult ConvertUTF16toUTF32 ( 168f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF16** sourceStart, const UTF16* sourceEnd, 169f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { 170f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ConversionResult result = conversionOK; 171f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF16* source = *sourceStart; 172f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF32* target = *targetStart; 173f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF32 ch, ch2; 
174f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) while (source < sourceEnd) { 175f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 176f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ch = *source++; 177f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* If we have a surrogate pair, convert to UTF32 first. */ 178f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 179f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* If the 16 bits following the high surrogate are in the source buffer... */ 180f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (source < sourceEnd) { 181f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ch2 = *source; 182f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* If it's a low surrogate, convert to UTF32. 
*/ 183f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 184f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 185f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) + (ch2 - UNI_SUR_LOW_START) + halfBase; 186f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ++source; 187f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 188f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) --source; /* return to the illegal value itself */ 189f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) result = sourceIllegal; 190f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) break; 191f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 192f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else { /* We don't have the 16 bits following the high surrogate. 
*/ 193f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) --source; /* return to the high surrogate */ 194f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) result = sourceExhausted; 195f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) break; 196f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 197f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else if (flags == strictConversion) { 198f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* UTF-16 surrogate values are illegal in UTF-32 */ 199f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 200f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) --source; /* return to the illegal value itself */ 201f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) result = sourceIllegal; 202f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) break; 203f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 204f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 205f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (target >= targetEnd) { 206f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) source = oldSource; /* Back up source pointer! 
*/ 207f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) result = targetExhausted; break; 208f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 209f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *target++ = ch; 210f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 211f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *sourceStart = source; 212f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *targetStart = target; 213f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#ifdef CVTUTF_DEBUG 214f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)if (result == sourceIllegal) { 215f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); 216f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) fflush(stderr); 217f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)} 218f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#endif 219f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) return result; 220f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)} 221f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)ConversionResult ConvertUTF16toUTF8 ( 222f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF16** sourceStart, const UTF16* sourceEnd, 223f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 224f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ConversionResult result = conversionOK; 225f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF16* source = *sourceStart; 226f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF8* target = *targetStart; 227f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) while (source < sourceEnd) { 228f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF32 ch; 
229f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) unsigned short bytesToWrite = 0; 230f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF32 byteMask = 0xBF; 231f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF32 byteMark = 0x80; 232f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ 233f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ch = *source++; 234f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* If we have a surrogate pair, convert to UTF32 first. */ 235f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { 236f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* If the 16 bits following the high surrogate are in the source buffer... */ 237f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (source < sourceEnd) { 238f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF32 ch2 = *source; 239f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* If it's a low surrogate, convert to UTF32. 
*/ 240f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { 241f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ch = ((ch - UNI_SUR_HIGH_START) << halfShift) 242f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) + (ch2 - UNI_SUR_LOW_START) + halfBase; 243f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ++source; 244f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ 245f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) --source; /* return to the illegal value itself */ 246f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) result = sourceIllegal; 247f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) break; 248f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 249f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else { /* We don't have the 16 bits following the high surrogate. 
*/ 250f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) --source; /* return to the high surrogate */ 251f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) result = sourceExhausted; 252f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) break; 253f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 254f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else if (flags == strictConversion) { 255f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* UTF-16 surrogate values are illegal in UTF-32 */ 256f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { 257f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) --source; /* return to the illegal value itself */ 258f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) result = sourceIllegal; 259f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) break; 260f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 261f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 262f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* Figure out how many bytes the result will require */ 263f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (ch < (UTF32)0x80) { bytesToWrite = 1; 264f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 265f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 266f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; 267f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else { bytesToWrite = 3; 268f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ch = UNI_REPLACEMENT_CHAR; 269f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 270f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 
271f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) target += bytesToWrite; 272f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (target > targetEnd) { 273f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) source = oldSource; /* Back up source pointer! */ 274f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) target -= bytesToWrite; result = targetExhausted; break; 275f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 276f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) switch (bytesToWrite) { /* note: everything falls through. */ 277f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 278f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 279f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 280f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); 281f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 282f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) target += bytesToWrite; 283f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 284f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *sourceStart = source; 285f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *targetStart = target; 286f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) return result; 287f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)} 288f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 289f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */ 290f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 291f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne 
(Richard Coles)ConversionResult ConvertUTF32toUTF8 ( 292f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF32** sourceStart, const UTF32* sourceEnd, 293f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { 294f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ConversionResult result = conversionOK; 295f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF32* source = *sourceStart; 296f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF8* target = *targetStart; 297f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) while (source < sourceEnd) { 298f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF32 ch; 299f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) unsigned short bytesToWrite = 0; 300f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF32 byteMask = 0xBF; 301f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF32 byteMark = 0x80; 302f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ch = *source++; 303f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (flags == strictConversion ) { 304f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* UTF-16 surrogate values are illegal in UTF-32 */ 305f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 306f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) --source; /* return to the illegal value itself */ 307f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) result = sourceIllegal; 308f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) break; 309f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 310f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 311f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* 312f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne 
(Richard Coles) * Figure out how many bytes the result will require. Turn any 313f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * illegally large UTF32 things (> Plane 17) into replacement chars. 314f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) */ 315f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (ch < (UTF32)0x80) { bytesToWrite = 1; 316f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else if (ch < (UTF32)0x800) { bytesToWrite = 2; 317f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; 318f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; 319f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } else { bytesToWrite = 3; 320f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) ch = UNI_REPLACEMENT_CHAR; 321f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) result = sourceIllegal; 322f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 323f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 324f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) target += bytesToWrite; 325f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (target > targetEnd) { 326f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) --source; /* Back up source pointer! */ 327f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) target -= bytesToWrite; result = targetExhausted; break; 328f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 329f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) switch (bytesToWrite) { /* note: everything falls through. 
*/ 330f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 331f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 332f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; 333f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); 334f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 335f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) target += bytesToWrite; 336f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 337f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *sourceStart = source; 338f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) *targetStart = target; 339f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) return result; 340f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)} 341f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 342f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */ 343f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 344f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* 345f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Utility routine to tell whether a sequence of bytes is legal UTF-8. 346f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * This must be called with the length pre-determined by the first byte. 
347f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * If not calling this from ConvertUTF8to*, then the length can be set by: 348f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * length = trailingBytesForUTF8[*source]+1; 349f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * and the sequence is illegal right away if there aren't that many bytes 350f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * available. 351f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * If presented with a length > 4, this returns false. The Unicode 352f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * definition of UTF-8 goes up to 4-byte sequences. 353f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) */ 354f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 355f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static Boolean isLegalUTF8(const UTF8 *source, int length) { 356f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF8 a; 357f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF8 *srcptr = source+length; 358f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) switch (length) { 359f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) default: return false; 360f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* Everything else falls through when "true"... 
*/ 361f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 362f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 363f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; 364f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 365f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) switch (*source) { 366f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* no fall-through in this inner switch */ 367f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 0xE0: if (a < 0xA0) return false; break; 368f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 0xED: if (a > 0x9F) return false; break; 369f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 0xF0: if (a < 0x90) return false; break; 370f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 0xF4: if (a > 0x8F) return false; break; 371f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) default: if (a < 0x80) return false; 372f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 373f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 374f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) case 1: if (*source >= 0x80 && *source < 0xC2) return false; 375f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 376f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (*source > 0xF4) return false; 377f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) return true; 378f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)} 379f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 380f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */ 
381f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 382f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* 383f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Exported function to return whether a UTF-8 sequence is legal or not. 384f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * This is not used here; it's just exported. 385f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) */ 386f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { 387f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) int length = trailingBytesForUTF8[*source]+1; 388f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) if (length > sourceEnd - source) { 389f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) return false; 390f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) } 391f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) return isLegalUTF8(source, length); 392f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)} 393f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 394f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)/* --------------------------------------------------------------------- */ 395f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 396f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)static unsigned 397f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source, 398f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) const UTF8 *sourceEnd) { 399f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) UTF8 b1, b2, b3; 400f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 401f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) assert(!isLegalUTF8Sequence(source, sourceEnd)); 402f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) 
403f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) /* 404f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Unicode 6.3.0, D93b: 405f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * 406f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * Maximal subpart of an ill-formed subsequence: The longest code unit 407f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles) * subsequence starting at an unconvertible offset that is either: 408 * a. the initial subsequence of a well-formed code unit sequence, or 409 * b. a subsequence of length one. 410 */ 411 412 if (source == sourceEnd) 413 return 0; 414 415 /* 416 * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8 417 * Byte Sequences. 418 */ 419 420 b1 = *source; 421 ++source; 422 if (b1 >= 0xC2 && b1 <= 0xDF) { 423 /* 424 * First byte is valid, but we know that this code unit sequence is 425 * invalid, so the maximal subpart has to end after the first byte. 426 */ 427 return 1; 428 } 429 430 if (source == sourceEnd) 431 return 1; 432 433 b2 = *source; 434 ++source; 435 436 if (b1 == 0xE0) { 437 return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1; 438 } 439 if (b1 >= 0xE1 && b1 <= 0xEC) { 440 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; 441 } 442 if (b1 == 0xED) { 443 return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1; 444 } 445 if (b1 >= 0xEE && b1 <= 0xEF) { 446 return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1; 447 } 448 if (b1 == 0xF0) { 449 if (b2 >= 0x90 && b2 <= 0xBF) { 450 if (source == sourceEnd) 451 return 2; 452 453 b3 = *source; 454 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 455 } 456 return 1; 457 } 458 if (b1 >= 0xF1 && b1 <= 0xF3) { 459 if (b2 >= 0x80 && b2 <= 0xBF) { 460 if (source == sourceEnd) 461 return 2; 462 463 b3 = *source; 464 return (b3 >= 0x80 && b3 <= 0xBF) ? 
3 : 2; 465 } 466 return 1; 467 } 468 if (b1 == 0xF4) { 469 if (b2 >= 0x80 && b2 <= 0x8F) { 470 if (source == sourceEnd) 471 return 2; 472 473 b3 = *source; 474 return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2; 475 } 476 return 1; 477 } 478 479 assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5); 480 /* 481 * There are no valid sequences that start with these bytes. Maximal subpart 482 * is defined to have length 1 in these cases. 483 */ 484 return 1; 485} 486 487/* --------------------------------------------------------------------- */ 488 489/* 490 * Exported function to return the total number of bytes in a codepoint 491 * represented in UTF-8, given the value of the first byte. 492 */ 493unsigned getNumBytesForUTF8(UTF8 first) { 494 return trailingBytesForUTF8[first] + 1; 495} 496 497/* --------------------------------------------------------------------- */ 498 499/* 500 * Exported function to return whether a UTF-8 string is legal or not. 501 * This is not used here; it's just exported. 502 */ 503Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) { 504 while (*source != sourceEnd) { 505 int length = trailingBytesForUTF8[**source] + 1; 506 if (length > sourceEnd - *source || !isLegalUTF8(*source, length)) 507 return false; 508 *source += length; 509 } 510 return true; 511} 512 513/* --------------------------------------------------------------------- */ 514 515ConversionResult ConvertUTF8toUTF16 ( 516 const UTF8** sourceStart, const UTF8* sourceEnd, 517 UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { 518 ConversionResult result = conversionOK; 519 const UTF8* source = *sourceStart; 520 UTF16* target = *targetStart; 521 while (source < sourceEnd) { 522 UTF32 ch = 0; 523 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 524 if (extraBytesToRead >= sourceEnd - source) { 525 result = sourceExhausted; break; 526 } 527 /* Do this check whether lenient or strict */ 528 if (!isLegalUTF8(source, extraBytesToRead+1)) { 529 result 
= sourceIllegal; 530 break; 531 } 532 /* 533 * The cases all fall through. See "Note A" below. 534 */ 535 switch (extraBytesToRead) { 536 case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 537 case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ 538 case 3: ch += *source++; ch <<= 6; 539 case 2: ch += *source++; ch <<= 6; 540 case 1: ch += *source++; ch <<= 6; 541 case 0: ch += *source++; 542 } 543 ch -= offsetsFromUTF8[extraBytesToRead]; 544 545 if (target >= targetEnd) { 546 source -= (extraBytesToRead+1); /* Back up source pointer! */ 547 result = targetExhausted; break; 548 } 549 if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ 550 /* UTF-16 surrogate values are illegal in UTF-32 */ 551 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 552 if (flags == strictConversion) { 553 source -= (extraBytesToRead+1); /* return to the illegal value itself */ 554 result = sourceIllegal; 555 break; 556 } else { 557 *target++ = UNI_REPLACEMENT_CHAR; 558 } 559 } else { 560 *target++ = (UTF16)ch; /* normal case */ 561 } 562 } else if (ch > UNI_MAX_UTF16) { 563 if (flags == strictConversion) { 564 result = sourceIllegal; 565 source -= (extraBytesToRead+1); /* return to the start */ 566 break; /* Bail out; shouldn't continue */ 567 } else { 568 *target++ = UNI_REPLACEMENT_CHAR; 569 } 570 } else { 571 /* target is a character in range 0xFFFF - 0x10FFFF. */ 572 if (target + 1 >= targetEnd) { 573 source -= (extraBytesToRead+1); /* Back up source pointer! 
*/ 574 result = targetExhausted; break; 575 } 576 ch -= halfBase; 577 *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); 578 *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); 579 } 580 } 581 *sourceStart = source; 582 *targetStart = target; 583 return result; 584} 585 586/* --------------------------------------------------------------------- */ 587 588static ConversionResult ConvertUTF8toUTF32Impl( 589 const UTF8** sourceStart, const UTF8* sourceEnd, 590 UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags, 591 Boolean InputIsPartial) { 592 ConversionResult result = conversionOK; 593 const UTF8* source = *sourceStart; 594 UTF32* target = *targetStart; 595 while (source < sourceEnd) { 596 UTF32 ch = 0; 597 unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; 598 if (extraBytesToRead >= sourceEnd - source) { 599 if (flags == strictConversion || InputIsPartial) { 600 result = sourceExhausted; 601 break; 602 } else { 603 result = sourceIllegal; 604 605 /* 606 * Replace the maximal subpart of ill-formed sequence with 607 * replacement character. 608 */ 609 source += findMaximalSubpartOfIllFormedUTF8Sequence(source, 610 sourceEnd); 611 *target++ = UNI_REPLACEMENT_CHAR; 612 continue; 613 } 614 } 615 if (target >= targetEnd) { 616 result = targetExhausted; break; 617 } 618 619 /* Do this check whether lenient or strict */ 620 if (!isLegalUTF8(source, extraBytesToRead+1)) { 621 result = sourceIllegal; 622 if (flags == strictConversion) { 623 /* Abort conversion. */ 624 break; 625 } else { 626 /* 627 * Replace the maximal subpart of ill-formed sequence with 628 * replacement character. 629 */ 630 source += findMaximalSubpartOfIllFormedUTF8Sequence(source, 631 sourceEnd); 632 *target++ = UNI_REPLACEMENT_CHAR; 633 continue; 634 } 635 } 636 /* 637 * The cases all fall through. See "Note A" below. 
638 */ 639 switch (extraBytesToRead) { 640 case 5: ch += *source++; ch <<= 6; 641 case 4: ch += *source++; ch <<= 6; 642 case 3: ch += *source++; ch <<= 6; 643 case 2: ch += *source++; ch <<= 6; 644 case 1: ch += *source++; ch <<= 6; 645 case 0: ch += *source++; 646 } 647 ch -= offsetsFromUTF8[extraBytesToRead]; 648 649 if (ch <= UNI_MAX_LEGAL_UTF32) { 650 /* 651 * UTF-16 surrogate values are illegal in UTF-32, and anything 652 * over Plane 17 (> 0x10FFFF) is illegal. 653 */ 654 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { 655 if (flags == strictConversion) { 656 source -= (extraBytesToRead+1); /* return to the illegal value itself */ 657 result = sourceIllegal; 658 break; 659 } else { 660 *target++ = UNI_REPLACEMENT_CHAR; 661 } 662 } else { 663 *target++ = ch; 664 } 665 } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ 666 result = sourceIllegal; 667 *target++ = UNI_REPLACEMENT_CHAR; 668 } 669 } 670 *sourceStart = source; 671 *targetStart = target; 672 return result; 673} 674 675ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart, 676 const UTF8 *sourceEnd, 677 UTF32 **targetStart, 678 UTF32 *targetEnd, 679 ConversionFlags flags) { 680 return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, 681 flags, /*InputIsPartial=*/true); 682} 683 684ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, 685 const UTF8 *sourceEnd, UTF32 **targetStart, 686 UTF32 *targetEnd, ConversionFlags flags) { 687 return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd, 688 flags, /*InputIsPartial=*/false); 689} 690 691/* --------------------------------------------------------------------- 692 693 Note A. 694 The fall-through switches in UTF-8 reading code save a 695 temp variable, some decrements & conditionals. 
The switches 696 are equivalent to the following loop: 697 { 698 int tmpBytesToRead = extraBytesToRead+1; 699 do { 700 ch += *source++; 701 --tmpBytesToRead; 702 if (tmpBytesToRead) ch <<= 6; 703 } while (tmpBytesToRead > 0); 704 } 705 In UTF-8 writing code, the switches on "bytesToWrite" are 706 similarly unrolled loops. 707 708 --------------------------------------------------------------------- */ 709