/******************************************************************************

 @File         PVRTUnicode.cpp

 @Title        PVRTUnicode

 @Version       @Version

 @Copyright    Copyright (c) Imagination Technologies Limited.

 @Platform     All

 @Description  A small collection of functions used to decode Unicode formats to
               individual code points.

******************************************************************************/
177b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#include "PVRTUnicode.h"
187b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#include <string.h>
197b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
207b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/****************************************************************************
217b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens** Constants
227b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens****************************************************************************/
237b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensconst PVRTuint32 c_u32ReplChar = 0xFFFD;
247b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
257b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define VALID_ASCII 0x80
267b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define TAIL_MASK 0x3F
277b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define BYTES_PER_TAIL 6
287b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
297b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UTF16_SURG_H_MARK 0xD800
307b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UTF16_SURG_H_END  0xDBFF
317b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UTF16_SURG_L_MARK 0xDC00
327b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UTF16_SURG_L_END  0xDFFF
337b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
347b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UNICODE_NONCHAR_MARK 0xFDD0
357b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UNICODE_NONCHAR_END  0xFDEF
367b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UNICODE_RESERVED	 0xFFFE
377b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define UNICODE_MAX			 0x10FFFF
387b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
397b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens#define MAX_LEN 0x8FFF
407b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
417b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/****************************************************************************
427b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens** A table which allows quick lookup to determine the number of bytes of a
437b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens** UTF8 code point.
447b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens****************************************************************************/
457b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensconst PVRTuint8 c_u8UTF8Lengths[256] =
467b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{
477b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
487b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
497b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
507b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
517b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
527b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
537b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
547b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
557b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
567b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
577b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
587b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
597b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
607b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
617b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
627b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,
637b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens};
647b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
657b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/****************************************************************************
667b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens** A table which allows quick lookup to determine whether a UTF8 sequence
677b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens** is 'overlong'.
687b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens****************************************************************************/
697b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensconst PVRTuint32 c_u32MinVals[4] =
707b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{
717b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0x00000000,		// 0 tail bytes
727b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0x00000080,		// 1 tail bytes
737b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0x00000800,		// 2 tail bytes
747b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	0x00010000,		// 3 tail bytes
757b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens};
767b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
777b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/*!***************************************************************************
787b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Function			CheckGenericUnicode
797b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Input				c32			A UTF32 character/Unicode code point
807b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Returns			Success or failure.
817b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Description		Checks that the decoded code point is valid.
827b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens*****************************************************************************/
837b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensstatic bool CheckGenericUnicode(PVRTuint32 c32)
847b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{
857b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	// Check that this value isn't a UTF16 surrogate mask.
867b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_L_END)
877b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		return false;
887b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	// Check non-char values
897b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	if(c32 >= UNICODE_NONCHAR_MARK && c32 <= UNICODE_NONCHAR_END)
907b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		return false;
917b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	// Check reserved values
927b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	if((c32 & UNICODE_RESERVED) == UNICODE_RESERVED)
937b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		return false;
947b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	// Check max value.
957b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	if(c32 > UNICODE_MAX)
967b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		return false;
977b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
987b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	return true;
997b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens}
1007b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1017b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/*!***************************************************************************
1027b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Function			PVRTUnicodeUTF8ToUTF32
1037b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Input				pUTF8			A UTF8 string, which is null terminated.
1047b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Output			aUTF32			An array of Unicode code points.
1057b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Returns			Success or failure.
1067b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Description		Decodes a UTF8-encoded string in to Unicode code points
1077b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens					(UTF32). If pUTF8 is not null terminated, the results are
1087b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens					undefined.
1097b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens*****************************************************************************/
1107b21f276fa91ad62fd2055844688b07829f12205Nicolas CapensEPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32)
1117b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{
1127b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	unsigned int uiTailLen, uiIndex;
1137b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
1147b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	PVRTuint32 c32;
1157b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1167b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	const PVRTuint8* pC = pUTF8;
1177b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	while(*pC)
1187b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	{
1197b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Quick optimisation for ASCII characters
1207b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		while(*pC && *pC < VALID_ASCII)
1217b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		{
1227b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			aUTF32.Append(*pC++);
1237b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		}
1247b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Done
1257b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		if(!*pC)
1267b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			break;
1277b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1287b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		c32 = *pC++;
1297b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		uiTailLen = c_u8UTF8Lengths[c32];
1307b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1317b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
1327b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Also check to make sure the tail length is inside the provided buffer.
1337b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
1347b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			return PVR_OVERFLOW;
1357b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1367b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		c32 &= (TAIL_MASK >> uiTailLen);	// Get the data out of the first byte. This depends on the length of the tail.
1377b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1387b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Get the data out of each tail byte
1397b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		uiIndex = 0;
1407b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		while(uiIndex < uiTailLen)
1417b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		{
1427b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			if((pC[uiIndex] & 0xC0) != 0x80)
1437b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens				return PVR_FAIL;		// Invalid tail byte!
1447b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1457b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
1467b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			uiIndex++;
1477b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		}
1487b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1497b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		pC += uiIndex;
1507b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1517b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Check overlong values.
1527b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		if(c32 < c_u32MinVals[uiTailLen])
1537b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			return PVR_FAIL;
1547b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1557b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		if(!CheckGenericUnicode(c32))
1567b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			return PVR_FAIL;
1577b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1587b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// OK
1597b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		aUTF32.Append(c32);
1607b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	}
1617b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1627b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	return PVR_SUCCESS;
1637b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens}
1647b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1657b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/*!***************************************************************************
1667b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Function			PVRTUnicodeUTF16ToUTF32
1677b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Input				pUTF16			A UTF16 string, which is null terminated.
1687b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Output			aUTF32			An array of Unicode code points.
1697b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Returns			Success or failure.
1707b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Description		Decodes a UTF16-encoded string in to Unicode code points
1717b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens					(UTF32). If pUTF16 is not null terminated, the results are
1727b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens					undefined.
1737b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens*****************************************************************************/
1747b21f276fa91ad62fd2055844688b07829f12205Nicolas CapensEPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32)
1757b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{
1767b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	const PVRTuint16* pC = pUTF16;
1777b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1787b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	// Determine the number of shorts
1797b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	while(*++pC && (pC - pUTF16) < MAX_LEN);
1807b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	unsigned int uiBufferLen = (unsigned int) (pC - pUTF16);
1817b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1827b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	if(uiBufferLen == MAX_LEN)
1837b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		return PVR_OVERFLOW;		// Probably not NULL terminated.
1847b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1857b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	// Reset to start.
1867b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	pC = pUTF16;
1877b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1887b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	PVRTuint32 c32;
1897b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	while(*pC)
1907b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	{
1917b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Straight copy. We'll check for surrogate pairs next...
1927b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		c32 = *pC++;
1937b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
1947b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Check surrogate pair
1957b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END)
1967b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		{
1977b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			// Make sure the next 2 bytes are in range...
1987b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0)
1997b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens				return PVR_OVERFLOW;
2007b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2017b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			// Check that the next value is in the low surrogate range
2027b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END)
2037b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens				return PVR_FAIL;
2047b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2057b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			// Decode
2067b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000;
2077b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			pC++;
2087b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		}
2097b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2107b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		if(!CheckGenericUnicode(c32))
2117b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			return PVR_FAIL;
2127b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2137b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// OK
2147b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		aUTF32.Append(c32);
2157b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	}
2167b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2177b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	return PVR_SUCCESS;
2187b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens}
2197b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2207b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/*!***************************************************************************
2217b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Function			PVRTUnicodeUTF8Length
2227b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Input				pUTF8			A UTF8 string, which is null terminated.
2237b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Returns			The length of the string, in Unicode code points.
2247b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Description		Calculates the length of a UTF8 string. If pUTF8 is
2257b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens					not null terminated, the results are undefined.
2267b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens*****************************************************************************/
2277b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensunsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8)
2287b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{
2297b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	const PVRTuint8* pC = pUTF8;
2307b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2317b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	unsigned int charCount = 0;
2327b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	unsigned int mask;
2337b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	while(*pC)
2347b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	{
2357b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Quick optimisation for ASCII characters
2367b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		const PVRTuint8* pStart = pC;
2377b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		while(*pC && *pC < VALID_ASCII)
2387b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			pC++;
2397b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2407b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		charCount += (unsigned int) (pC - pStart);
2417b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2427b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Done
2437b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		if(!*pC)
2447b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			break;
2457b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2467b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		mask = *pC & 0xF0;
2477b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		switch(mask)
2487b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		{
2497b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		case 0xF0: pC++;
2507b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		case 0xE0: pC++;
2517b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		case 0xC0: pC++;
2527b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			break;
2537b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		default:
2547b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			_ASSERT(!"Invalid tail byte!");
2557b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			return 0;
2567b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		}
2577b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2587b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		pC++;
2597b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		charCount++;
2607b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	}
2617b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2627b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	return charCount;
2637b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens}
2647b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2657b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/*!***************************************************************************
2667b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Function			PVRTUnicodeUTF16Length
2677b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Input				pUTF16			A UTF16 string, which is null terminated.
2687b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Returns			The length of the string, in Unicode code points.
2697b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Description		Calculates the length of a UTF16 string.
2707b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens					If pUTF16 is not null terminated, the results are
2717b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens					undefined.
2727b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens*****************************************************************************/
2737b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensunsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16)
2747b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{
2757b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	const PVRTuint16* pC = pUTF16;
2767b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	unsigned int charCount = 0;
2777b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	while(*pC && (pC - pUTF16) < MAX_LEN)
2787b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	{
2797b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		if(	pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END
2807b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		 && pC[1] >= UTF16_SURG_L_MARK && pC[0] <= UTF16_SURG_L_END)
2817b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		{
2827b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			pC += 2;
2837b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		}
2847b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		else
2857b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		{
2867b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			pC += 1;
2877b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		}
2887b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2897b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		charCount++;
2907b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	}
2917b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2927b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	return charCount;
2937b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens}
2947b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
2957b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens/*!***************************************************************************
2967b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Function			PVRTUnicodeValidUTF8
2977b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Input				pUTF8			A UTF8 string, which is null terminated.
2987b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Returns			true or false
2997b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens @Description		Checks whether the encoding of a UTF8 string is valid.
3007b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens					If pUTF8 is not null terminated, the results are undefined.
3017b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens*****************************************************************************/
3027b21f276fa91ad62fd2055844688b07829f12205Nicolas Capensbool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8)
3037b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens{
3047b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	unsigned int uiTailLen, uiIndex;
3057b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
3067b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	const PVRTuint8* pC = pUTF8;
3077b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	while(*pC)
3087b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	{
3097b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Quick optimisation for ASCII characters
3107b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		while(*pC && *pC < VALID_ASCII)	pC++;
3117b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Done?
3127b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		if(!*pC)
3137b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			break;
3147b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
3157b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		PVRTuint32 c32 = *pC++;
3167b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		uiTailLen = c_u8UTF8Lengths[c32];
3177b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
3187b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
3197b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Also check to make sure the tail length is inside the provided buffer.
3207b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
3217b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			return false;
3227b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
3237b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Get the data out of each tail byte
3247b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		uiIndex = 0;
3257b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		while(uiIndex < uiTailLen)
3267b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		{
3277b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			if((pC[uiIndex] & 0xC0) != 0x80)
3287b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens				return false;		// Invalid tail byte!
3297b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
3307b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
3317b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			uiIndex++;
3327b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		}
3337b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
3347b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		pC += uiIndex;
3357b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
3367b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		// Check overlong values.
3377b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		if(c32 < c_u32MinVals[uiTailLen])
3387b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			return false;
3397b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens		if(!CheckGenericUnicode(c32))
3407b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens			return false;
3417b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	}
3427b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
3437b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens	return true;
3447b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens}
3457b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
/*****************************************************************************
 End of file (PVRTUnicode.cpp)
*****************************************************************************/
3497b21f276fa91ad62fd2055844688b07829f12205Nicolas Capens
350