1/*
2 * The authors of this software are Rob Pike and Ken Thompson.
3 *              Copyright (c) 2002 by Lucent Technologies.
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose without fee is hereby granted, provided that this entire notice
6 * is included in all copies of any software which is or includes a copy
7 * or modification of this software and in all copies of the supporting
8 * documentation for such software.
9 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10 * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13 */
14#include <stdarg.h>
15#include <string.h>
16#include "phonenumbers/utf/utf.h"
17#include "phonenumbers/utf/utfdef.h"
18
19enum
20{
21	Bit1	= 7,
22	Bitx	= 6,
23	Bit2	= 5,
24	Bit3	= 4,
25	Bit4	= 3,
26	Bit5	= 2,
27
28	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
29	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
30	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
31	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
32	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
33	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
34
35	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
36	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
37	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
38	Rune4	= (1<<(Bit4+3*Bitx))-1,
39                                        /* 0001 1111 1111 1111 1111 1111 */
40
41	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
42	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
43
44	Bad	= Runeerror,
45};
46
47/*
48 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
49 * This is a slower but "safe" version of the old chartorune
50 * that works on strings that are not necessarily null-terminated.
51 *
52 * If you know for sure that your string is null-terminated,
53 * chartorune will be a bit faster.
54 *
55 * It is guaranteed not to attempt to access "length"
56 * past the incoming pointer.  This is to avoid
57 * possible access violations.  If the string appears to be
58 * well-formed but incomplete (i.e., to get the whole Rune
59 * we'd need to read past str+length) then we'll set the Rune
60 * to Bad and return 0.
61 *
62 * Note that if we have decoding problems for other
63 * reasons, we return 1 instead of 0.
64 */
65int
66charntorune(Rune *rune, const char *str, int length)
67{
68	int c, c1, c2, c3;
69	long l;
70
71	/* When we're not allowed to read anything */
72	if(length <= 0) {
73		goto badlen;
74	}
75
76	/*
77	 * one character sequence (7-bit value)
78	 *	00000-0007F => T1
79	 */
80	c = *(uchar*)str;
81	if(c < Tx) {
82		*rune = c;
83		return 1;
84	}
85
86	// If we can't read more than one character we must stop
87	if(length <= 1) {
88		goto badlen;
89	}
90
91	/*
92	 * two character sequence (11-bit value)
93	 *	0080-07FF => T2 Tx
94	 */
95	c1 = *(uchar*)(str+1) ^ Tx;
96	if(c1 & Testx)
97		goto bad;
98	if(c < T3) {
99		if(c < T2)
100			goto bad;
101		l = ((c << Bitx) | c1) & Rune2;
102		if(l <= Rune1)
103			goto bad;
104		*rune = l;
105		return 2;
106	}
107
108	// If we can't read more than two characters we must stop
109	if(length <= 2) {
110		goto badlen;
111	}
112
113	/*
114	 * three character sequence (16-bit value)
115	 *	0800-FFFF => T3 Tx Tx
116	 */
117	c2 = *(uchar*)(str+2) ^ Tx;
118	if(c2 & Testx)
119		goto bad;
120	if(c < T4) {
121		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
122		if(l <= Rune2)
123			goto bad;
124		*rune = l;
125		return 3;
126	}
127
128	if (length <= 3)
129		goto badlen;
130
131	/*
132	 * four character sequence (21-bit value)
133	 *	10000-1FFFFF => T4 Tx Tx Tx
134	 */
135	c3 = *(uchar*)(str+3) ^ Tx;
136	if (c3 & Testx)
137		goto bad;
138	if (c < T5) {
139		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
140		if (l <= Rune3)
141			goto bad;
142		*rune = l;
143		return 4;
144	}
145
146	// Support for 5-byte or longer UTF-8 would go here, but
147	// since we don't have that, we'll just fall through to bad.
148
149	/*
150	 * bad decoding
151	 */
152bad:
153	*rune = Bad;
154	return 1;
155badlen:
156	*rune = Bad;
157	return 0;
158
159}
160
161
162/*
163 * This is the older "unsafe" version, which works fine on
164 * null-terminated strings.
165 */
166int
167chartorune(Rune *rune, const char *str)
168{
169	int c, c1, c2, c3;
170	long l;
171
172	/*
173	 * one character sequence
174	 *	00000-0007F => T1
175	 */
176	c = *(uchar*)str;
177	if(c < Tx) {
178		*rune = c;
179		return 1;
180	}
181
182	/*
183	 * two character sequence
184	 *	0080-07FF => T2 Tx
185	 */
186	c1 = *(uchar*)(str+1) ^ Tx;
187	if(c1 & Testx)
188		goto bad;
189	if(c < T3) {
190		if(c < T2)
191			goto bad;
192		l = ((c << Bitx) | c1) & Rune2;
193		if(l <= Rune1)
194			goto bad;
195		*rune = l;
196		return 2;
197	}
198
199	/*
200	 * three character sequence
201	 *	0800-FFFF => T3 Tx Tx
202	 */
203	c2 = *(uchar*)(str+2) ^ Tx;
204	if(c2 & Testx)
205		goto bad;
206	if(c < T4) {
207		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
208		if(l <= Rune2)
209			goto bad;
210		*rune = l;
211		return 3;
212	}
213
214	/*
215	 * four character sequence (21-bit value)
216	 *	10000-1FFFFF => T4 Tx Tx Tx
217	 */
218	c3 = *(uchar*)(str+3) ^ Tx;
219	if (c3 & Testx)
220		goto bad;
221	if (c < T5) {
222		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
223		if (l <= Rune3)
224			goto bad;
225		*rune = l;
226		return 4;
227	}
228
229	/*
230	 * Support for 5-byte or longer UTF-8 would go here, but
231	 * since we don't have that, we'll just fall through to bad.
232	 */
233
234	/*
235	 * bad decoding
236	 */
237bad:
238	*rune = Bad;
239	return 1;
240}
241
242int
243isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
244	*consumed = charntorune(rune, str, length);
245	return *rune != Runeerror || *consumed == 3;
246}
247
248int
249runetochar(char *str, const Rune *rune)
250{
251	/* Runes are signed, so convert to unsigned for range check. */
252	unsigned long c;
253
254	/*
255	 * one character sequence
256	 *	00000-0007F => 00-7F
257	 */
258	c = *rune;
259	if(c <= Rune1) {
260		str[0] = c;
261		return 1;
262	}
263
264	/*
265	 * two character sequence
266	 *	0080-07FF => T2 Tx
267	 */
268	if(c <= Rune2) {
269		str[0] = T2 | (c >> 1*Bitx);
270		str[1] = Tx | (c & Maskx);
271		return 2;
272	}
273
274	/*
275	 * If the Rune is out of range, convert it to the error rune.
276	 * Do this test here because the error rune encodes to three bytes.
277	 * Doing it earlier would duplicate work, since an out of range
278	 * Rune wouldn't have fit in one or two bytes.
279	 */
280	if (c > Runemax)
281		c = Runeerror;
282
283	/*
284	 * three character sequence
285	 *	0800-FFFF => T3 Tx Tx
286	 */
287	if (c <= Rune3) {
288		str[0] = T3 |  (c >> 2*Bitx);
289		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
290		str[2] = Tx |  (c & Maskx);
291		return 3;
292	}
293
294	/*
295	 * four character sequence (21-bit value)
296	 *     10000-1FFFFF => T4 Tx Tx Tx
297	 */
298	str[0] = T4 | (c >> 3*Bitx);
299	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
300	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
301	str[3] = Tx | (c & Maskx);
302	return 4;
303}
304
305int
306runelen(Rune rune)
307{
308	char str[10];
309
310	return runetochar(str, &rune);
311}
312
313int
314runenlen(const Rune *r, int nrune)
315{
316	int nb, c;
317
318	nb = 0;
319	while(nrune--) {
320		c = *r++;
321		if (c <= Rune1)
322			nb++;
323		else if (c <= Rune2)
324			nb += 2;
325		else if (c <= Rune3)
326			nb += 3;
327		else /* assert(c <= Rune4) */
328			nb += 4;
329	}
330	return nb;
331}
332
333int
334fullrune(const char *str, int n)
335{
336	if (n > 0) {
337		int c = *(uchar*)str;
338		if (c < Tx)
339			return 1;
340		if (n > 1) {
341			if (c < T3)
342				return 1;
343			if (n > 2) {
344				if (c < T4 || n > 3)
345					return 1;
346			}
347		}
348	}
349	return 0;
350}
351