1/****************************************************************************** 2 3 @File PVRTUnicode.cpp 4 5 @Title PVRTUnicode 6 7 @Version @Version 8 9 @Copyright Copyright (c) Imagination Technologies Limited. 10 11 @Platform All 12 13 @Description A small collection of functions used to decode Unicode formats to 14 individual code points. 15 16******************************************************************************/ 17#include "PVRTUnicode.h" 18#include <string.h> 19 20/**************************************************************************** 21** Constants 22****************************************************************************/ 23const PVRTuint32 c_u32ReplChar = 0xFFFD; 24 25#define VALID_ASCII 0x80 26#define TAIL_MASK 0x3F 27#define BYTES_PER_TAIL 6 28 29#define UTF16_SURG_H_MARK 0xD800 30#define UTF16_SURG_H_END 0xDBFF 31#define UTF16_SURG_L_MARK 0xDC00 32#define UTF16_SURG_L_END 0xDFFF 33 34#define UNICODE_NONCHAR_MARK 0xFDD0 35#define UNICODE_NONCHAR_END 0xFDEF 36#define UNICODE_RESERVED 0xFFFE 37#define UNICODE_MAX 0x10FFFF 38 39#define MAX_LEN 0x8FFF 40 41/**************************************************************************** 42** A table which allows quick lookup to determine the number of bytes of a 43** UTF8 code point. 44****************************************************************************/ 45const PVRTuint8 c_u8UTF8Lengths[256] = 46{ 47 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 48 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 49 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 50 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 51 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 52 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 53 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 54 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 55 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 56 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 57 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 58 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 59 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 60 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 61 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 62 3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0, 63}; 64 65/**************************************************************************** 66** A table which allows quick lookup to determine whether a UTF8 sequence 67** is 'overlong'. 68****************************************************************************/ 69const PVRTuint32 c_u32MinVals[4] = 70{ 71 0x00000000, // 0 tail bytes 72 0x00000080, // 1 tail bytes 73 0x00000800, // 2 tail bytes 74 0x00010000, // 3 tail bytes 75}; 76 77/*!*************************************************************************** 78 @Function CheckGenericUnicode 79 @Input c32 A UTF32 character/Unicode code point 80 @Returns Success or failure. 81 @Description Checks that the decoded code point is valid. 82*****************************************************************************/ 83static bool CheckGenericUnicode(PVRTuint32 c32) 84{ 85 // Check that this value isn't a UTF16 surrogate mask. 86 if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_L_END) 87 return false; 88 // Check non-char values 89 if(c32 >= UNICODE_NONCHAR_MARK && c32 <= UNICODE_NONCHAR_END) 90 return false; 91 // Check reserved values 92 if((c32 & UNICODE_RESERVED) == UNICODE_RESERVED) 93 return false; 94 // Check max value. 95 if(c32 > UNICODE_MAX) 96 return false; 97 98 return true; 99} 100 101/*!*************************************************************************** 102 @Function PVRTUnicodeUTF8ToUTF32 103 @Input pUTF8 A UTF8 string, which is null terminated. 104 @Output aUTF32 An array of Unicode code points. 105 @Returns Success or failure. 106 @Description Decodes a UTF8-encoded string in to Unicode code points 107 (UTF32). If pUTF8 is not null terminated, the results are 108 undefined. 109*****************************************************************************/ 110EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32) 111{ 112 unsigned int uiTailLen, uiIndex; 113 unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8); 114 PVRTuint32 c32; 115 116 const PVRTuint8* pC = pUTF8; 117 while(*pC) 118 { 119 // Quick optimisation for ASCII characters 120 while(*pC && *pC < VALID_ASCII) 121 { 122 aUTF32.Append(*pC++); 123 } 124 // Done 125 if(!*pC) 126 break; 127 128 c32 = *pC++; 129 uiTailLen = c_u8UTF8Lengths[c32]; 130 131 // Check for invalid tail length. Maximum 4 bytes for each UTF8 character. 132 // Also check to make sure the tail length is inside the provided buffer. 133 if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes)) 134 return PVR_OVERFLOW; 135 136 c32 &= (TAIL_MASK >> uiTailLen); // Get the data out of the first byte. This depends on the length of the tail. 137 138 // Get the data out of each tail byte 139 uiIndex = 0; 140 while(uiIndex < uiTailLen) 141 { 142 if((pC[uiIndex] & 0xC0) != 0x80) 143 return PVR_FAIL; // Invalid tail byte! 144 145 c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK); 146 uiIndex++; 147 } 148 149 pC += uiIndex; 150 151 // Check overlong values. 152 if(c32 < c_u32MinVals[uiTailLen]) 153 return PVR_FAIL; 154 155 if(!CheckGenericUnicode(c32)) 156 return PVR_FAIL; 157 158 // OK 159 aUTF32.Append(c32); 160 } 161 162 return PVR_SUCCESS; 163} 164 165/*!*************************************************************************** 166 @Function PVRTUnicodeUTF16ToUTF32 167 @Input pUTF16 A UTF16 string, which is null terminated. 168 @Output aUTF32 An array of Unicode code points. 169 @Returns Success or failure. 170 @Description Decodes a UTF16-encoded string in to Unicode code points 171 (UTF32). If pUTF16 is not null terminated, the results are 172 undefined. 173*****************************************************************************/ 174EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32) 175{ 176 const PVRTuint16* pC = pUTF16; 177 178 // Determine the number of shorts 179 while(*++pC && (pC - pUTF16) < MAX_LEN); 180 unsigned int uiBufferLen = (unsigned int) (pC - pUTF16); 181 182 if(uiBufferLen == MAX_LEN) 183 return PVR_OVERFLOW; // Probably not NULL terminated. 184 185 // Reset to start. 186 pC = pUTF16; 187 188 PVRTuint32 c32; 189 while(*pC) 190 { 191 // Straight copy. We'll check for surrogate pairs next... 192 c32 = *pC++; 193 194 // Check surrogate pair 195 if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END) 196 { 197 // Make sure the next 2 bytes are in range... 198 if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0) 199 return PVR_OVERFLOW; 200 201 // Check that the next value is in the low surrogate range 202 if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END) 203 return PVR_FAIL; 204 205 // Decode 206 c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000; 207 pC++; 208 } 209 210 if(!CheckGenericUnicode(c32)) 211 return PVR_FAIL; 212 213 // OK 214 aUTF32.Append(c32); 215 } 216 217 return PVR_SUCCESS; 218} 219 220/*!*************************************************************************** 221 @Function PVRTUnicodeUTF8Length 222 @Input pUTF8 A UTF8 string, which is null terminated. 223 @Returns The length of the string, in Unicode code points. 224 @Description Calculates the length of a UTF8 string. If pUTF8 is 225 not null terminated, the results are undefined. 226*****************************************************************************/ 227unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8) 228{ 229 const PVRTuint8* pC = pUTF8; 230 231 unsigned int charCount = 0; 232 unsigned int mask; 233 while(*pC) 234 { 235 // Quick optimisation for ASCII characters 236 const PVRTuint8* pStart = pC; 237 while(*pC && *pC < VALID_ASCII) 238 pC++; 239 240 charCount += (unsigned int) (pC - pStart); 241 242 // Done 243 if(!*pC) 244 break; 245 246 mask = *pC & 0xF0; 247 switch(mask) 248 { 249 case 0xF0: pC++; 250 case 0xE0: pC++; 251 case 0xC0: pC++; 252 break; 253 default: 254 _ASSERT(!"Invalid tail byte!"); 255 return 0; 256 } 257 258 pC++; 259 charCount++; 260 } 261 262 return charCount; 263} 264 265/*!*************************************************************************** 266 @Function PVRTUnicodeUTF16Length 267 @Input pUTF16 A UTF16 string, which is null terminated. 268 @Returns The length of the string, in Unicode code points. 269 @Description Calculates the length of a UTF16 string. 270 If pUTF16 is not null terminated, the results are 271 undefined. 272*****************************************************************************/ 273unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16) 274{ 275 const PVRTuint16* pC = pUTF16; 276 unsigned int charCount = 0; 277 while(*pC && (pC - pUTF16) < MAX_LEN) 278 { 279 if( pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END 280 && pC[1] >= UTF16_SURG_L_MARK && pC[0] <= UTF16_SURG_L_END) 281 { 282 pC += 2; 283 } 284 else 285 { 286 pC += 1; 287 } 288 289 charCount++; 290 } 291 292 return charCount; 293} 294 295/*!*************************************************************************** 296 @Function PVRTUnicodeValidUTF8 297 @Input pUTF8 A UTF8 string, which is null terminated. 298 @Returns true or false 299 @Description Checks whether the encoding of a UTF8 string is valid. 300 If pUTF8 is not null terminated, the results are undefined. 301*****************************************************************************/ 302bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8) 303{ 304 unsigned int uiTailLen, uiIndex; 305 unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8); 306 const PVRTuint8* pC = pUTF8; 307 while(*pC) 308 { 309 // Quick optimisation for ASCII characters 310 while(*pC && *pC < VALID_ASCII) pC++; 311 // Done? 312 if(!*pC) 313 break; 314 315 PVRTuint32 c32 = *pC++; 316 uiTailLen = c_u8UTF8Lengths[c32]; 317 318 // Check for invalid tail length. Maximum 4 bytes for each UTF8 character. 319 // Also check to make sure the tail length is inside the provided buffer. 320 if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes)) 321 return false; 322 323 // Get the data out of each tail byte 324 uiIndex = 0; 325 while(uiIndex < uiTailLen) 326 { 327 if((pC[uiIndex] & 0xC0) != 0x80) 328 return false; // Invalid tail byte! 329 330 c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK); 331 uiIndex++; 332 } 333 334 pC += uiIndex; 335 336 // Check overlong values. 337 if(c32 < c_u32MinVals[uiTailLen]) 338 return false; 339 if(!CheckGenericUnicode(c32)) 340 return false; 341 } 342 343 return true; 344} 345 346/***************************************************************************** 347 End of file (PVRTUnicode.cpp) 348*****************************************************************************/ 349 350