17b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/****************************************************************************** 27b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 37b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @File PVRTUnicode.cpp 47b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 57b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Title PVRTUnicode 67b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 77b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Version @Version 87b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 97b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Copyright Copyright (c) Imagination Technologies Limited. 107b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 117b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Platform All 127b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 137b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Description A small collection of functions used to decode Unicode formats to 147b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens individual code points. 157b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 167b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens******************************************************************************/ 177b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#include "PVRTUnicode.h" 187b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#include <string.h> 197b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 207b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/**************************************************************************** 217b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens** Constants 227b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens****************************************************************************/ 237b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensconst PVRTuint32 c_u32ReplChar = 0xFFFD; 247b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 257b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define VALID_ASCII 0x80 267b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define TAIL_MASK 0x3F 277b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define BYTES_PER_TAIL 6 287b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 297b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UTF16_SURG_H_MARK 0xD800 307b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UTF16_SURG_H_END 0xDBFF 317b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UTF16_SURG_L_MARK 0xDC00 327b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UTF16_SURG_L_END 0xDFFF 337b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 347b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UNICODE_NONCHAR_MARK 0xFDD0 357b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UNICODE_NONCHAR_END 0xFDEF 367b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UNICODE_RESERVED 0xFFFE 377b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UNICODE_MAX 0x10FFFF 387b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 397b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define MAX_LEN 0x8FFF 407b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 417b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/**************************************************************************** 427b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens** A table which allows quick lookup to determine the number of bytes of a 437b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens** UTF8 code point. 447b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens****************************************************************************/ 457b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensconst PVRTuint8 c_u8UTF8Lengths[256] = 467b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{ 477b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 487b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 497b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 507b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 517b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 527b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 537b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 547b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 557b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 567b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 577b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 587b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 597b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 607b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 617b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 627b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0, 637b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens}; 647b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 657b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/**************************************************************************** 667b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens** A table which allows quick lookup to determine whether a UTF8 sequence 677b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens** is 'overlong'. 687b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens****************************************************************************/ 697b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensconst PVRTuint32 c_u32MinVals[4] = 707b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{ 717b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0x00000000, // 0 tail bytes 727b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0x00000080, // 1 tail bytes 737b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0x00000800, // 2 tail bytes 747b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 0x00010000, // 3 tail bytes 757b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens}; 767b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 777b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/*!*************************************************************************** 787b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Function CheckGenericUnicode 797b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Input c32 A UTF32 character/Unicode code point 807b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Returns Success or failure. 817b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Description Checks that the decoded code point is valid. 827b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens*****************************************************************************/ 837b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensstatic bool CheckGenericUnicode(PVRTuint32 c32) 847b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{ 857b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Check that this value isn't a UTF16 surrogate mask. 867b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_L_END) 877b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return false; 887b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Check non-char values 897b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(c32 >= UNICODE_NONCHAR_MARK && c32 <= UNICODE_NONCHAR_END) 907b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return false; 917b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Check reserved values 927b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if((c32 & UNICODE_RESERVED) == UNICODE_RESERVED) 937b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return false; 947b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Check max value. 957b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(c32 > UNICODE_MAX) 967b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return false; 977b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 987b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return true; 997b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens} 1007b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1017b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/*!*************************************************************************** 1027b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Function PVRTUnicodeUTF8ToUTF32 1037b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Input pUTF8 A UTF8 string, which is null terminated. 1047b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Output aUTF32 An array of Unicode code points. 1057b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Returns Success or failure. 1067b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Description Decodes a UTF8-encoded string in to Unicode code points 1077b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens (UTF32). If pUTF8 is not null terminated, the results are 1087b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens undefined. 1097b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens*****************************************************************************/ 1107b21f276fa91ad62fd2055844688b07829f12205Nicolas CapensEPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32) 1117b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{ 1127b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens unsigned int uiTailLen, uiIndex; 1137b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8); 1147b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens PVRTuint32 c32; 1157b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1167b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens const PVRTuint8* pC = pUTF8; 1177b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens while(*pC) 1187b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens { 1197b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Quick optimisation for ASCII characters 1207b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens while(*pC && *pC < VALID_ASCII) 1217b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens { 1227b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens aUTF32.Append(*pC++); 1237b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens } 1247b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Done 1257b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(!*pC) 1267b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens break; 1277b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1287b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens c32 = *pC++; 1297b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens uiTailLen = c_u8UTF8Lengths[c32]; 1307b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1317b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Check for invalid tail length. Maximum 4 bytes for each UTF8 character. 1327b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Also check to make sure the tail length is inside the provided buffer. 1337b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes)) 1347b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return PVR_OVERFLOW; 1357b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1367b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens c32 &= (TAIL_MASK >> uiTailLen); // Get the data out of the first byte. This depends on the length of the tail. 1377b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1387b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Get the data out of each tail byte 1397b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens uiIndex = 0; 1407b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens while(uiIndex < uiTailLen) 1417b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens { 1427b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if((pC[uiIndex] & 0xC0) != 0x80) 1437b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return PVR_FAIL; // Invalid tail byte! 1447b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1457b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK); 1467b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens uiIndex++; 1477b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens } 1487b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1497b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens pC += uiIndex; 1507b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1517b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Check overlong values. 1527b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(c32 < c_u32MinVals[uiTailLen]) 1537b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return PVR_FAIL; 1547b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1557b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(!CheckGenericUnicode(c32)) 1567b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return PVR_FAIL; 1577b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1587b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // OK 1597b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens aUTF32.Append(c32); 1607b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens } 1617b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1627b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return PVR_SUCCESS; 1637b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens} 1647b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1657b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/*!*************************************************************************** 1667b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Function PVRTUnicodeUTF16ToUTF32 1677b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Input pUTF16 A UTF16 string, which is null terminated. 1687b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Output aUTF32 An array of Unicode code points. 1697b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Returns Success or failure. 1707b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Description Decodes a UTF16-encoded string in to Unicode code points 1717b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens (UTF32). If pUTF16 is not null terminated, the results are 1727b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens undefined. 1737b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens*****************************************************************************/ 1747b21f276fa91ad62fd2055844688b07829f12205Nicolas CapensEPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32) 1757b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{ 1767b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens const PVRTuint16* pC = pUTF16; 1777b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1787b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Determine the number of shorts 1797b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens while(*++pC && (pC - pUTF16) < MAX_LEN); 1807b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens unsigned int uiBufferLen = (unsigned int) (pC - pUTF16); 1817b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1827b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(uiBufferLen == MAX_LEN) 1837b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return PVR_OVERFLOW; // Probably not NULL terminated. 1847b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1857b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Reset to start. 1867b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens pC = pUTF16; 1877b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1887b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens PVRTuint32 c32; 1897b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens while(*pC) 1907b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens { 1917b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Straight copy. We'll check for surrogate pairs next... 1927b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens c32 = *pC++; 1937b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 1947b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Check surrogate pair 1957b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END) 1967b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens { 1977b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Make sure the next 2 bytes are in range... 1987b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0) 1997b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return PVR_OVERFLOW; 2007b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2017b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Check that the next value is in the low surrogate range 2027b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END) 2037b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return PVR_FAIL; 2047b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2057b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Decode 2067b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000; 2077b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens pC++; 2087b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens } 2097b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2107b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(!CheckGenericUnicode(c32)) 2117b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return PVR_FAIL; 2127b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2137b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // OK 2147b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens aUTF32.Append(c32); 2157b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens } 2167b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2177b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return PVR_SUCCESS; 2187b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens} 2197b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2207b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/*!*************************************************************************** 2217b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Function PVRTUnicodeUTF8Length 2227b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Input pUTF8 A UTF8 string, which is null terminated. 2237b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Returns The length of the string, in Unicode code points. 2247b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Description Calculates the length of a UTF8 string. If pUTF8 is 2257b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens not null terminated, the results are undefined. 2267b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens*****************************************************************************/ 2277b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensunsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8) 2287b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{ 2297b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens const PVRTuint8* pC = pUTF8; 2307b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2317b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens unsigned int charCount = 0; 2327b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens unsigned int mask; 2337b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens while(*pC) 2347b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens { 2357b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Quick optimisation for ASCII characters 2367b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens const PVRTuint8* pStart = pC; 2377b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens while(*pC && *pC < VALID_ASCII) 2387b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens pC++; 2397b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2407b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens charCount += (unsigned int) (pC - pStart); 2417b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2427b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Done 2437b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(!*pC) 2447b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens break; 2457b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2467b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens mask = *pC & 0xF0; 2477b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens switch(mask) 2487b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens { 2497b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens case 0xF0: pC++; 2507b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens case 0xE0: pC++; 2517b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens case 0xC0: pC++; 2527b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens break; 2537b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens default: 2547b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens _ASSERT(!"Invalid tail byte!"); 2557b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return 0; 2567b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens } 2577b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2587b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens pC++; 2597b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens charCount++; 2607b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens } 2617b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2627b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return charCount; 2637b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens} 2647b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2657b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/*!*************************************************************************** 2667b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Function PVRTUnicodeUTF16Length 2677b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Input pUTF16 A UTF16 string, which is null terminated. 2687b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Returns The length of the string, in Unicode code points. 2697b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Description Calculates the length of a UTF16 string. 2707b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens If pUTF16 is not null terminated, the results are 2717b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens undefined. 2727b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens*****************************************************************************/ 2737b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensunsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16) 2747b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{ 2757b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens const PVRTuint16* pC = pUTF16; 2767b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens unsigned int charCount = 0; 2777b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens while(*pC && (pC - pUTF16) < MAX_LEN) 2787b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens { 2797b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if( pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END 2807b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens && pC[1] >= UTF16_SURG_L_MARK && pC[0] <= UTF16_SURG_L_END) 2817b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens { 2827b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens pC += 2; 2837b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens } 2847b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens else 2857b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens { 2867b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens pC += 1; 2877b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens } 2887b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2897b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens charCount++; 2907b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens } 2917b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2927b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return charCount; 2937b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens} 2947b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 2957b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/*!*************************************************************************** 2967b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Function PVRTUnicodeValidUTF8 2977b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Input pUTF8 A UTF8 string, which is null terminated. 2987b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Returns true or false 2997b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Description Checks whether the encoding of a UTF8 string is valid. 3007b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens If pUTF8 is not null terminated, the results are undefined. 3017b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens*****************************************************************************/ 3027b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensbool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8) 3037b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{ 3047b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens unsigned int uiTailLen, uiIndex; 3057b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8); 3067b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens const PVRTuint8* pC = pUTF8; 3077b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens while(*pC) 3087b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens { 3097b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Quick optimisation for ASCII characters 3107b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens while(*pC && *pC < VALID_ASCII) pC++; 3117b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Done? 3127b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(!*pC) 3137b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens break; 3147b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 3157b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens PVRTuint32 c32 = *pC++; 3167b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens uiTailLen = c_u8UTF8Lengths[c32]; 3177b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 3187b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Check for invalid tail length. Maximum 4 bytes for each UTF8 character. 3197b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Also check to make sure the tail length is inside the provided buffer. 3207b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes)) 3217b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return false; 3227b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 3237b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Get the data out of each tail byte 3247b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens uiIndex = 0; 3257b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens while(uiIndex < uiTailLen) 3267b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens { 3277b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if((pC[uiIndex] & 0xC0) != 0x80) 3287b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return false; // Invalid tail byte! 3297b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 3307b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK); 3317b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens uiIndex++; 3327b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens } 3337b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 3347b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens pC += uiIndex; 3357b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 3367b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens // Check overlong values. 3377b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(c32 < c_u32MinVals[uiTailLen]) 3387b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return false; 3397b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens if(!CheckGenericUnicode(c32)) 3407b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return false; 3417b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens } 3427b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 3437b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens return true; 3447b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens} 3457b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 3467b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/***************************************************************************** 3477b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens End of file (PVRTUnicode.cpp) 3487b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens*****************************************************************************/ 3497b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens 350