1/******************************************************************************
2
3 @File         PVRTUnicode.cpp
4
5 @Title        PVRTUnicode
6
7 @Version       @Version
8
9 @Copyright    Copyright (c) Imagination Technologies Limited.
10
11 @Platform     All
12
13 @Description  A small collection of functions used to decode Unicode formats to
14               individual code points.
15
16******************************************************************************/
17#include "PVRTUnicode.h"
18#include <string.h>
19
20/****************************************************************************
21** Constants
22****************************************************************************/
23const PVRTuint32 c_u32ReplChar = 0xFFFD;
24
25#define VALID_ASCII 0x80
26#define TAIL_MASK 0x3F
27#define BYTES_PER_TAIL 6
28
29#define UTF16_SURG_H_MARK 0xD800
30#define UTF16_SURG_H_END  0xDBFF
31#define UTF16_SURG_L_MARK 0xDC00
32#define UTF16_SURG_L_END  0xDFFF
33
34#define UNICODE_NONCHAR_MARK 0xFDD0
35#define UNICODE_NONCHAR_END  0xFDEF
36#define UNICODE_RESERVED	 0xFFFE
37#define UNICODE_MAX			 0x10FFFF
38
39#define MAX_LEN 0x8FFF
40
41/****************************************************************************
42** A table which allows quick lookup to determine the number of bytes of a
43** UTF8 code point.
44****************************************************************************/
45const PVRTuint8 c_u8UTF8Lengths[256] =
46{
47	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
48	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
49	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
50	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
51	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
52	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
53	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
54	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
55	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
56	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
57	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
58	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
59	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
60	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
61	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
62	3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,
63};
64
65/****************************************************************************
66** A table which allows quick lookup to determine whether a UTF8 sequence
67** is 'overlong'.
68****************************************************************************/
69const PVRTuint32 c_u32MinVals[4] =
70{
71	0x00000000,		// 0 tail bytes
72	0x00000080,		// 1 tail bytes
73	0x00000800,		// 2 tail bytes
74	0x00010000,		// 3 tail bytes
75};
76
77/*!***************************************************************************
78 @Function			CheckGenericUnicode
79 @Input				c32			A UTF32 character/Unicode code point
80 @Returns			Success or failure.
81 @Description		Checks that the decoded code point is valid.
82*****************************************************************************/
83static bool CheckGenericUnicode(PVRTuint32 c32)
84{
85	// Check that this value isn't a UTF16 surrogate mask.
86	if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_L_END)
87		return false;
88	// Check non-char values
89	if(c32 >= UNICODE_NONCHAR_MARK && c32 <= UNICODE_NONCHAR_END)
90		return false;
91	// Check reserved values
92	if((c32 & UNICODE_RESERVED) == UNICODE_RESERVED)
93		return false;
94	// Check max value.
95	if(c32 > UNICODE_MAX)
96		return false;
97
98	return true;
99}
100
101/*!***************************************************************************
102 @Function			PVRTUnicodeUTF8ToUTF32
103 @Input				pUTF8			A UTF8 string, which is null terminated.
104 @Output			aUTF32			An array of Unicode code points.
105 @Returns			Success or failure.
106 @Description		Decodes a UTF8-encoded string in to Unicode code points
107					(UTF32). If pUTF8 is not null terminated, the results are
108					undefined.
109*****************************************************************************/
110EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32)
111{
112	unsigned int uiTailLen, uiIndex;
113	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
114	PVRTuint32 c32;
115
116	const PVRTuint8* pC = pUTF8;
117	while(*pC)
118	{
119		// Quick optimisation for ASCII characters
120		while(*pC && *pC < VALID_ASCII)
121		{
122			aUTF32.Append(*pC++);
123		}
124		// Done
125		if(!*pC)
126			break;
127
128		c32 = *pC++;
129		uiTailLen = c_u8UTF8Lengths[c32];
130
131		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
132		// Also check to make sure the tail length is inside the provided buffer.
133		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
134			return PVR_OVERFLOW;
135
136		c32 &= (TAIL_MASK >> uiTailLen);	// Get the data out of the first byte. This depends on the length of the tail.
137
138		// Get the data out of each tail byte
139		uiIndex = 0;
140		while(uiIndex < uiTailLen)
141		{
142			if((pC[uiIndex] & 0xC0) != 0x80)
143				return PVR_FAIL;		// Invalid tail byte!
144
145			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
146			uiIndex++;
147		}
148
149		pC += uiIndex;
150
151		// Check overlong values.
152		if(c32 < c_u32MinVals[uiTailLen])
153			return PVR_FAIL;
154
155		if(!CheckGenericUnicode(c32))
156			return PVR_FAIL;
157
158		// OK
159		aUTF32.Append(c32);
160	}
161
162	return PVR_SUCCESS;
163}
164
165/*!***************************************************************************
166 @Function			PVRTUnicodeUTF16ToUTF32
167 @Input				pUTF16			A UTF16 string, which is null terminated.
168 @Output			aUTF32			An array of Unicode code points.
169 @Returns			Success or failure.
170 @Description		Decodes a UTF16-encoded string in to Unicode code points
171					(UTF32). If pUTF16 is not null terminated, the results are
172					undefined.
173*****************************************************************************/
174EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32)
175{
176	const PVRTuint16* pC = pUTF16;
177
178	// Determine the number of shorts
179	while(*++pC && (pC - pUTF16) < MAX_LEN);
180	unsigned int uiBufferLen = (unsigned int) (pC - pUTF16);
181
182	if(uiBufferLen == MAX_LEN)
183		return PVR_OVERFLOW;		// Probably not NULL terminated.
184
185	// Reset to start.
186	pC = pUTF16;
187
188	PVRTuint32 c32;
189	while(*pC)
190	{
191		// Straight copy. We'll check for surrogate pairs next...
192		c32 = *pC++;
193
194		// Check surrogate pair
195		if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END)
196		{
197			// Make sure the next 2 bytes are in range...
198			if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0)
199				return PVR_OVERFLOW;
200
201			// Check that the next value is in the low surrogate range
202			if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END)
203				return PVR_FAIL;
204
205			// Decode
206			c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000;
207			pC++;
208		}
209
210		if(!CheckGenericUnicode(c32))
211			return PVR_FAIL;
212
213		// OK
214		aUTF32.Append(c32);
215	}
216
217	return PVR_SUCCESS;
218}
219
220/*!***************************************************************************
221 @Function			PVRTUnicodeUTF8Length
222 @Input				pUTF8			A UTF8 string, which is null terminated.
223 @Returns			The length of the string, in Unicode code points.
224 @Description		Calculates the length of a UTF8 string. If pUTF8 is
225					not null terminated, the results are undefined.
226*****************************************************************************/
227unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8)
228{
229	const PVRTuint8* pC = pUTF8;
230
231	unsigned int charCount = 0;
232	unsigned int mask;
233	while(*pC)
234	{
235		// Quick optimisation for ASCII characters
236		const PVRTuint8* pStart = pC;
237		while(*pC && *pC < VALID_ASCII)
238			pC++;
239
240		charCount += (unsigned int) (pC - pStart);
241
242		// Done
243		if(!*pC)
244			break;
245
246		mask = *pC & 0xF0;
247		switch(mask)
248		{
249		case 0xF0: pC++;
250		case 0xE0: pC++;
251		case 0xC0: pC++;
252			break;
253		default:
254			_ASSERT(!"Invalid tail byte!");
255			return 0;
256		}
257
258		pC++;
259		charCount++;
260	}
261
262	return charCount;
263}
264
265/*!***************************************************************************
266 @Function			PVRTUnicodeUTF16Length
267 @Input				pUTF16			A UTF16 string, which is null terminated.
268 @Returns			The length of the string, in Unicode code points.
269 @Description		Calculates the length of a UTF16 string.
270					If pUTF16 is not null terminated, the results are
271					undefined.
272*****************************************************************************/
273unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16)
274{
275	const PVRTuint16* pC = pUTF16;
276	unsigned int charCount = 0;
277	while(*pC && (pC - pUTF16) < MAX_LEN)
278	{
279		if(	pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END
280		 && pC[1] >= UTF16_SURG_L_MARK && pC[0] <= UTF16_SURG_L_END)
281		{
282			pC += 2;
283		}
284		else
285		{
286			pC += 1;
287		}
288
289		charCount++;
290	}
291
292	return charCount;
293}
294
295/*!***************************************************************************
296 @Function			PVRTUnicodeValidUTF8
297 @Input				pUTF8			A UTF8 string, which is null terminated.
298 @Returns			true or false
299 @Description		Checks whether the encoding of a UTF8 string is valid.
300					If pUTF8 is not null terminated, the results are undefined.
301*****************************************************************************/
302bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8)
303{
304	unsigned int uiTailLen, uiIndex;
305	unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
306	const PVRTuint8* pC = pUTF8;
307	while(*pC)
308	{
309		// Quick optimisation for ASCII characters
310		while(*pC && *pC < VALID_ASCII)	pC++;
311		// Done?
312		if(!*pC)
313			break;
314
315		PVRTuint32 c32 = *pC++;
316		uiTailLen = c_u8UTF8Lengths[c32];
317
318		// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
319		// Also check to make sure the tail length is inside the provided buffer.
320		if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
321			return false;
322
323		// Get the data out of each tail byte
324		uiIndex = 0;
325		while(uiIndex < uiTailLen)
326		{
327			if((pC[uiIndex] & 0xC0) != 0x80)
328				return false;		// Invalid tail byte!
329
330			c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
331			uiIndex++;
332		}
333
334		pC += uiIndex;
335
336		// Check overlong values.
337		if(c32 < c_u32MinVals[uiTailLen])
338			return false;
339		if(!CheckGenericUnicode(c32))
340			return false;
341	}
342
343	return true;
344}
345
346/*****************************************************************************
347 End of file (PVRTUnicode.cpp)
348*****************************************************************************/
349
350