PowerVR_SDK/Tools/PVRTUnicode.cpp

/******************************************************************************

 @File         PVRTUnicode.cpp

 @Title        PVRTUnicode

 @Version       @Version

 @Copyright    Copyright (c) Imagination Technologies Limited.

 @Platform     All

 @Description  A small collection of functions used to decode Unicode formats to
               individual code points.

******************************************************************************/
#include "PVRTUnicode.h"
#include <string.h>

/****************************************************************************
** Constants
****************************************************************************/
// U+FFFD, the Unicode REPLACEMENT CHARACTER. Not referenced by the functions
// in this file; presumably provided for callers - TODO confirm.
const PVRTuint32 c_u32ReplChar = 0xFFFD;

// UTF8 bytes below this value are single-byte (ASCII) code points.
#define VALID_ASCII 0x80
// Selects the six payload bits of a UTF8 tail byte. Also shifted right by the
// tail length to select the payload bits of a lead byte.
#define TAIL_MASK 0x3F
// Shift applied per tail byte when accumulating a code point. Despite the
// name this is a count of bits, not bytes.
#define BYTES_PER_TAIL 6

// Inclusive ranges of the UTF16 high (lead) and low (trail) surrogates.
#define UTF16_SURG_H_MARK 0xD800
#define UTF16_SURG_H_END  0xDBFF
#define UTF16_SURG_L_MARK 0xDC00
#define UTF16_SURG_L_END  0xDFFF

// Inclusive range of non-character code points rejected by CheckGenericUnicode.
#define UNICODE_NONCHAR_MARK 0xFDD0
#define UNICODE_NONCHAR_END  0xFDEF
// Used as a bit mask by CheckGenericUnicode: any code point whose low 16 bits
// are 0xFFFE or 0xFFFF is rejected.
#define UNICODE_RESERVED	 0xFFFE
// Largest code point accepted by CheckGenericUnicode.
#define UNICODE_MAX			 0x10FFFF

// Upper bound on the number of 16-bit units scanned by the UTF16 functions
// while looking for the null terminator.
#define MAX_LEN 0x8FFF

/****************************************************************************
** A table, indexed by the value of the first byte of a UTF8 sequence, which
** gives the number of tail (continuation) bytes that follow that byte.
** An entry of 0 means the byte does not start a multi-byte sequence: this
** covers ASCII (0x00-0x7F), tail bytes (0x80-0xBF) and 0xF8-0xFF. The decoders
** in this file handle ASCII before consulting the table, so they treat a 0
** entry as an error.
****************************************************************************/
const PVRTuint8 c_u8UTF8Lengths[256] =
{
	// 0x00 - 0x7F: ASCII
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	// 0x80 - 0xBF: tail bytes, never valid as the first byte
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	// 0xC0 - 0xDF: one tail byte
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	// 0xE0 - 0xEF: two tail bytes
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	// 0xF0 - 0xF7: three tail bytes; 0xF8 - 0xFF: invalid
	3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,
};

/****************************************************************************
** A table which allows quick lookup to determine whether a UTF8 sequence
** is 'overlong'. It is indexed by the number of tail bytes and gives the
** smallest code point that may be encoded with that many tail bytes; a
** decoded value below the entry is rejected by the decoders in this file.
****************************************************************************/
const PVRTuint32 c_u32MinVals[4] =
{
	0x00000000,		// 0 tail bytes
	0x00000080,		// 1 tail bytes
	0x00000800,		// 2 tail bytes
	0x00010000,		// 3 tail bytes
};

/*!***************************************************************************
 @Function			CheckGenericUnicode
 @Input				c32			A UTF32 character/Unicode code point
 @Returns			true if the code point is acceptable, false otherwise.
 @Description		Rejects code points that must not appear in decoded text:
					UTF16 surrogates, the non-character block, any value whose
					low 16 bits are 0xFFFE or 0xFFFF, and anything above the
					Unicode maximum.
*****************************************************************************/
static bool CheckGenericUnicode(PVRTuint32 c32)
{
	// Anywhere in the combined high + low surrogate range.
	const bool bSurrogate = (c32 >= UTF16_SURG_H_MARK) && (c32 <= UTF16_SURG_L_END);

	// Inside the contiguous non-character block.
	const bool bNonChar = (c32 >= UNICODE_NONCHAR_MARK) && (c32 <= UNICODE_NONCHAR_END);

	// The mask ignores bit 0, so this matches both ...FFFE and ...FFFF.
	const bool bReserved = ((c32 & UNICODE_RESERVED) == UNICODE_RESERVED);

	// Beyond the last Unicode code point.
	const bool bTooLarge = (c32 > UNICODE_MAX);

	return !(bSurrogate || bNonChar || bReserved || bTooLarge);
}

/*!***************************************************************************
 @Function			PVRTUnicodeUTF8ToUTF32
 @Input				pUTF8			A UTF8 string, which is null terminated.
 @Output			aUTF32			An array of Unicode code points.
 @Returns			PVR_SUCCESS if the whole string was decoded.
					PVR_OVERFLOW if a byte cannot start a sequence or the
					sequence would run past the end of the string.
					PVR_FAIL if a tail byte is malformed, the sequence is
					overlong, or the code point fails CheckGenericUnicode.
 @Description		Decodes a UTF8-encoded string in to Unicode code points
					(UTF32), appending each one to aUTF32 as it is decoded.
					Code points appended before an error is detected are not
					removed. If pUTF8 is not null terminated, the results are
					undefined.
*****************************************************************************/
EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32)
{
	// Address of the null terminator; no sequence may extend beyond it.
	const unsigned int uiByteCount = (unsigned int) strlen((const char*)pUTF8);
	const PVRTuint8* const pTerminator = pUTF8 + uiByteCount;

	const PVRTuint8* pCursor = pUTF8;
	while(*pCursor)
	{
		const PVRTuint8 u8Lead = *pCursor++;

		// ASCII bytes are already code points.
		if(u8Lead < VALID_ASCII)
		{
			aUTF32.Append(u8Lead);
			continue;
		}

		// A zero entry means this byte cannot start a multi-byte sequence.
		// The whole tail must also lie before the terminator.
		const unsigned int uiTailCount = c_u8UTF8Lengths[u8Lead];
		if(uiTailCount == 0)
			return PVR_OVERFLOW;
		if(pCursor + uiTailCount > pTerminator)
			return PVR_OVERFLOW;

		// The longer the tail, the fewer payload bits the lead byte carries.
		PVRTuint32 u32CodePoint = u8Lead & (TAIL_MASK >> uiTailCount);

		// Fold in six payload bits from every tail byte.
		for(unsigned int uiTail = 0; uiTail < uiTailCount; ++uiTail)
		{
			const PVRTuint8 u8Tail = pCursor[uiTail];
			if((u8Tail & 0xC0) != 0x80)
				return PVR_FAIL;		// Not of the form 10xxxxxx

			u32CodePoint = (u32CodePoint << BYTES_PER_TAIL) + (u8Tail & TAIL_MASK);
		}
		pCursor += uiTailCount;

		// Reject overlong encodings, then values that are never valid.
		if(u32CodePoint < c_u32MinVals[uiTailCount] || !CheckGenericUnicode(u32CodePoint))
			return PVR_FAIL;

		aUTF32.Append(u32CodePoint);
	}

	return PVR_SUCCESS;
}

/*!***************************************************************************
 @Function			PVRTUnicodeUTF16ToUTF32
 @Input				pUTF16			A UTF16 string, which is null terminated.
 @Output			aUTF32			An array of Unicode code points.
 @Returns			PVR_SUCCESS if the whole string was decoded.
					PVR_OVERFLOW if no terminator is found within MAX_LEN
					units, or a high surrogate is the last unit of the string.
					PVR_FAIL if a high surrogate is not followed by a low
					surrogate, or the code point fails CheckGenericUnicode
					(which includes an unpaired low surrogate).
 @Description		Decodes a UTF16-encoded string in to Unicode code points
					(UTF32), appending each one to aUTF32 as it is decoded.
					Code points appended before an error is detected are not
					removed. If pUTF16 is not null terminated, the results are
					undefined.
*****************************************************************************/
EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32)
{
	const PVRTuint16* pC = pUTF16;

	// Determine the number of shorts. Each unit is tested BEFORE advancing so
	// that an empty string never reads past its own terminator, and the bound
	// is tested first so that no unit at or beyond MAX_LEN is ever read.
	while((pC - pUTF16) < MAX_LEN && *pC)
		++pC;
	unsigned int uiBufferLen = (unsigned int) (pC - pUTF16);

	if(uiBufferLen == MAX_LEN)
		return PVR_OVERFLOW;		// Probably not NULL terminated.

	// Reset to start.
	pC = pUTF16;

	PVRTuint32 c32;
	while(*pC)
	{
		// Straight copy. We'll check for surrogate pairs next...
		c32 = *pC++;

		// Check surrogate pair
		if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END)
		{
			// Make sure the low surrogate is inside the string...
			if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0)
				return PVR_OVERFLOW;

			// Check that the next value is in the low surrogate range
			if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END)
				return PVR_FAIL;

			// Decode: ten bits from each surrogate, offset into the
			// supplementary planes.
			c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000;
			pC++;
		}

		// Also rejects an unpaired low surrogate, as it lies in the surrogate range.
		if(!CheckGenericUnicode(c32))
			return PVR_FAIL;

		// OK
		aUTF32.Append(c32);
	}

	return PVR_SUCCESS;
}

/*!***************************************************************************
 @Function			PVRTUnicodeUTF8Length
 @Input				pUTF8			A UTF8 string, which is null terminated.
 @Returns			The length of the string, in Unicode code points, or 0
					if a byte that cannot start a sequence is found or a
					multi-byte sequence is cut short by the terminator.
 @Description		Calculates the length of a UTF8 string. Only the first
					byte of each sequence is inspected to find its length; the
					tail bytes are skipped without being validated. If pUTF8
					is not null terminated, the results are undefined.
*****************************************************************************/
unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8)
{
	const PVRTuint8* pC = pUTF8;

	unsigned int charCount = 0;
	while(*pC)
	{
		// Quick optimisation for ASCII characters
		const PVRTuint8* pStart = pC;
		while(*pC && *pC < VALID_ASCII)
			pC++;

		charCount += (unsigned int) (pC - pStart);

		// Done
		if(!*pC)
			break;

		// Work out how many tail bytes follow this first byte. Two-byte
		// sequences start with 110xxxxx, so the top nibble may be 0xC or 0xD.
		unsigned int uiTailLen;
		switch(*pC & 0xF0)
		{
		case 0xF0: uiTailLen = 3; break;
		case 0xE0: uiTailLen = 2; break;
		case 0xC0:
		case 0xD0: uiTailLen = 1; break;
		default:
			_ASSERT(!"Invalid tail byte!");
			return 0;
		}

		// Step over the first byte.
		pC++;

		// Step over the tail, but never past the null terminator.
		for(unsigned int uiIndex = 0; uiIndex < uiTailLen; ++uiIndex)
		{
			if(!*pC)
			{
				_ASSERT(!"Truncated UTF8 sequence!");
				return 0;
			}
			pC++;
		}

		charCount++;
	}

	return charCount;
}

/*!***************************************************************************
 @Function			PVRTUnicodeUTF16Length
 @Input				pUTF16			A UTF16 string, which is null terminated.
 @Returns			The length of the string, in Unicode code points.
 @Description		Calculates the length of a UTF16 string. A high surrogate
					immediately followed by a low surrogate counts as one code
					point; every other unit, including an unpaired surrogate,
					counts as one code point on its own. No validation is
					performed. Counting stops once MAX_LEN units have been
					passed. If pUTF16 is not null terminated, the results are
					undefined.
*****************************************************************************/
unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16)
{
	const PVRTuint16* pC = pUTF16;
	unsigned int charCount = 0;
	while(*pC && (pC - pUTF16) < MAX_LEN)
	{
		// pC[0] is non-zero here, so pC[1] is at worst the null terminator.
		// Both bounds of the low surrogate range must be tested on pC[1].
		if(	pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END
		 && pC[1] >= UTF16_SURG_L_MARK && pC[1] <= UTF16_SURG_L_END)
		{
			pC += 2;
		}
		else
		{
			pC += 1;
		}

		charCount++;
	}

	return charCount;
}

/*!***************************************************************************
 @Function			PVRTUnicodeValidUTF8
 @Input				pUTF8			A UTF8 string, which is null terminated.
 @Returns			true if every sequence in the string is well formed and
					decodes to a code point accepted by CheckGenericUnicode,
					false otherwise.
 @Description		Checks whether the encoding of a UTF8 string is valid.
					A string is rejected if it contains a byte that cannot
					start a sequence, a sequence cut short by the terminator,
					a malformed tail byte, an overlong encoding, or a code
					point that CheckGenericUnicode refuses.
					If pUTF8 is not null terminated, the results are undefined.
*****************************************************************************/
bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8)
{
	unsigned int uiTailLen, uiIndex;
	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
	const PVRTuint8* pC = pUTF8;
	while(*pC)
	{
		// Quick optimisation for ASCII characters
		while(*pC && *pC < VALID_ASCII)	pC++;
		// Done?
		if(!*pC)
			break;

		PVRTuint32 c32 = *pC++;
		uiTailLen = c_u8UTF8Lengths[c32];

		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
		// Also check to make sure the tail length is inside the provided buffer.
		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
			return false;

		// Keep only the payload bits of the first byte. Without this the
		// length marker bits would be folded in to the code point, making the
		// overlong and range checks below operate on a meaningless value.
		c32 &= (TAIL_MASK >> uiTailLen);

		// Get the data out of each tail byte
		uiIndex = 0;
		while(uiIndex < uiTailLen)
		{
			if((pC[uiIndex] & 0xC0) != 0x80)
				return false;		// Invalid tail byte!

			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
			uiIndex++;
		}

		pC += uiIndex;

		// Check overlong values.
		if(c32 < c_u32MinVals[uiTailLen])
			return false;
		if(!CheckGenericUnicode(c32))
			return false;
	}

	return true;
}

/*****************************************************************************
 End of file (PVRTUnicode.cpp)
*****************************************************************************/