1/*
2 * Copyright (C) 2007 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <objmng/drm_i18n.h>
18
/*
 * Byte-range predicates for double-byte charsets.  Each one tests whether a
 * byte value lies inside the range(s) spelled out in its body.  The range
 * macros mention their argument more than once, so do not pass an expression
 * with side effects.  None of the GB2312/GBK/BIG5 predicates is referenced in
 * the visible part of this file.
 */
#define IS_GB2312_HIGH_BYTE(c)  ((c) >= 0xA1 && (c) <= 0xF7)
#define IS_GB2312_LOW_BYTE(c)   ((c) >= 0xA1 && (c) <= 0xFE)
#define IS_GBK_HIGH_BYTE(c)     ((c) >= 0x81 && (c) <= 0xFE)
#define IS_GBK_LOW_BYTE(c)      ((c) >= 0x40 && (c) <= 0xFE && (c) != 0x7F)
#define IS_BIG5_HIGH_BYTE(c)    ((c) >= 0xA1 && (c) <= 0xF9)
#define IS_BIG5_LOW_BYTE(c)     (((c) >= 0x40 && (c) <= 0x7E) \
                                 || ((c) >= 0xA1 && (c) <= 0xFE))
/* True for values up to 127; only an upper bound is checked, so the argument
 * is expected to be unsigned (the UTF-8 decoder passes a uint8_t). */
#define IS_ASCII(c)             ((c) <= 127)

/* Value stored in the output in place of input that cannot be decoded. */
#define INVALID_UNICODE         0xFFFD

/*
 * Build-time feature switches.  The latin1 and UTF-8/UTF-16 converters, and
 * the matching cases in the two DRM_i18n_* dispatchers, are compiled only
 * when these are defined.
 */
#define I18N_LATIN1_SUPPORT
#define I18N_UTF8_UTF16_SUPPORT
32
33
/**
 * Convert ISO 8859-1 (latin1) bytes to 16-bit wide characters.
 *
 * Each input byte becomes one wide character with the same value.  At most
 * bufSizeInWideChar characters are written.  When wcsBuf is NULL nothing is
 * written and the number of wide characters the input needs is returned.
 */
static int32_t latin1ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one wide character to a single ISO 8859-1 (latin1) byte.
 *
 * Values of 0x100 and above are replaced by '?'.  The byte is stored only
 * when mbs is not NULL and bufSize is positive; the return value is always 1.
 */
static int32_t wcToLatin1(uint16_t wc, uint8_t * mbs, int32_t bufSize);

/**
 * Convert UTF-8 to 16-bit wide characters.
 *
 * Malformed input and 4-byte sequences (U+10000 and above, not supported)
 * are stored as 0xFFFD.  A sequence cut off by the end of the input is left
 * unconsumed, which is reflected in *bytesConsumed.  When wcsBuf is NULL the
 * input is only scanned and counted.
 */
static int32_t utf8ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one wide character to UTF-8.
 *
 * Returns the number of bytes the encoding needs (1 to 3).  The bytes are
 * stored only when mbs is not NULL and bufSize is large enough for all of
 * them.
 */
static int32_t wcToUtf8(uint16_t wc, uint8_t * mbs, int32_t bufSize);

/**
 * Convert UTF-16 BE bytes to 16-bit wide characters.
 *
 * Every two input bytes (high byte first) form one wide character.
 * Surrogate pairs are copied through without being combined.
 */
static int32_t utf16beToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one wide character to two UTF-16 BE bytes (high byte first).
 *
 * Always returns 2; the bytes are stored only when mbs is not NULL and
 * bufSize is at least 2.
 */
static int32_t wcToUtf16be(uint16_t wc, uint8_t * mbs, int32_t bufSize);

/**
 * Convert UTF-16 LE bytes to 16-bit wide characters.
 *
 * Every two input bytes (low byte first) form one wide character.
 * Surrogate pairs are copied through without being combined.
 */
static int32_t utf16leToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one wide character to two UTF-16 LE bytes (low byte first).
 *
 * Always returns 2; the bytes are stored only when mbs is not NULL and
 * bufSize is at least 2.
 */
static int32_t wcToUtf16le(uint16_t wc, uint8_t * mbs, int32_t bufSize);
81
82/*
83 * see drm_i18n.h
84 */
85int32_t DRM_i18n_mbsToWcs(DRM_Charset_t charset,
86        const uint8_t *mbs, int32_t mbsLen,
87        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
88        int32_t *bytesConsumed)
89{
90    switch (charset)
91    {
92#ifdef I18N_GB2312_SUPPORT
93        case DRM_CHARSET_GB2312:
94            return gb2312ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
95#endif
96#ifdef I18N_GBK_SUPPORT
97        case DRM_CHARSET_GBK:
98            return gbkToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
99#endif
100#ifdef I18N_BIG5_SUPPORT
101        case DRM_CHARSET_BIG5:
102            return big5ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
103#endif
104#ifdef I18N_LATIN1_SUPPORT
105        case DRM_CHARSET_LATIN1:
106            return latin1ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
107#endif
108#ifdef I18N_ISO8859X_SUPPORT
109        case DRM_CHARSET_LATIN2:
110        case DRM_CHARSET_LATIN3:
111        case DRM_CHARSET_LATIN4:
112        case DRM_CHARSET_CYRILLIC:
113        case DRM_CHARSET_ARABIC:
114        case DRM_CHARSET_GREEK:
115        case DRM_CHARSET_HEBREW:
116        case DRM_CHARSET_LATIN5:
117        case DRM_CHARSET_LATIN6:
118        case DRM_CHARSET_THAI:
119        case DRM_CHARSET_LATIN7:
120        case DRM_CHARSET_LATIN8:
121        case DRM_CHARSET_LATIN9:
122        case DRM_CHARSET_LATIN10:
123            return iso8859xToWcs(charset, mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
124#endif
125#ifdef I18N_UTF8_UTF16_SUPPORT
126        case DRM_CHARSET_UTF8:
127            return utf8ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
128        case DRM_CHARSET_UTF16BE:
129            return utf16beToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
130        case DRM_CHARSET_UTF16LE:
131            return utf16leToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
132#endif
133        default:
134            return -1;
135    }
136}
137
138/*
139 * see drm_i18n.h
140 */
141int32_t DRM_i18n_wcsToMbs(DRM_Charset_t charset,
142        const uint16_t *wcs, int32_t wcsLen,
143        uint8_t *mbsBuf, int32_t bufSizeInByte)
144{
145    int32_t (* wcToMbFunc)(uint16_t, uint8_t *, int32_t);
146    int32_t charIndex = 0;
147    int32_t numMultiBytes = 0;
148
149    switch (charset)
150    {
151#ifdef I18N_LATIN1_SUPPORT
152        case DRM_CHARSET_LATIN1:
153            wcToMbFunc = wcToLatin1;
154            break;
155#endif
156#ifdef I18N_UTF8_UTF16_SUPPORT
157        case DRM_CHARSET_UTF8:
158            wcToMbFunc = wcToUtf8;
159            break;
160        case DRM_CHARSET_UTF16BE:
161            wcToMbFunc = wcToUtf16be;
162            break;
163        case DRM_CHARSET_UTF16LE:
164            wcToMbFunc = wcToUtf16le;
165            break;
166#endif
167#ifdef I18N_ISO8859X_SUPPORT
168        case DRM_CHARSET_LATIN2:
169        case DRM_CHARSET_LATIN3:
170        case DRM_CHARSET_LATIN4:
171        case DRM_CHARSET_CYRILLIC:
172        case DRM_CHARSET_ARABIC:
173        case DRM_CHARSET_GREEK:
174        case DRM_CHARSET_HEBREW:
175        case DRM_CHARSET_LATIN5:
176        case DRM_CHARSET_LATIN6:
177        case DRM_CHARSET_THAI:
178        case DRM_CHARSET_LATIN7:
179        case DRM_CHARSET_LATIN8:
180        case DRM_CHARSET_LATIN9:
181        case DRM_CHARSET_LATIN10:
182            return wcsToIso8859x(charset, wcs, wcsLen, mbsBuf, bufSizeInByte);
183#endif
184        default:
185            return -1;
186    }
187
188    if (mbsBuf) {
189        while (numMultiBytes < bufSizeInByte && charIndex < wcsLen) {
190            /* TODO: handle surrogate pair values here */
191            int32_t mbLen = wcToMbFunc(wcs[charIndex],
192                    &mbsBuf[numMultiBytes], bufSizeInByte - numMultiBytes);
193
194            if (numMultiBytes + mbLen > bufSizeInByte) {
195                /* Insufficient buffer. Don't update numMultiBytes */
196                break;
197            }
198            charIndex++;
199            numMultiBytes += mbLen;
200        }
201    } else {
202        while (charIndex < wcsLen) {
203            /* TODO: handle surrogate pair values here */
204            numMultiBytes += wcToMbFunc(wcs[charIndex], NULL, 0);
205            charIndex++;
206        }
207    }
208
209    return numMultiBytes;
210}
211
212
213#ifdef I18N_LATIN1_SUPPORT
214
/**
 * Convert ISO 8859-1 (latin1) bytes to 16-bit wide characters.
 *
 * Latin1 code points coincide with the first 256 Unicode code points, so
 * every input byte is widened unchanged into one wide character.
 *
 * @param mbs               Input bytes.
 * @param mbsLen            Number of bytes in mbs; a negative value is
 *                          treated as empty input.
 * @param wcsBuf            Output buffer, or NULL to only compute the
 *                          number of wide characters required.
 * @param bufSizeInWideChar Capacity of wcsBuf in wide characters; ignored
 *                          when wcsBuf is NULL.
 * @param bytesConsumed     If not NULL, always receives the number of input
 *                          bytes accounted for by the return value.
 * @return Number of wide characters written, or required when wcsBuf is
 *         NULL.  Never negative.
 */
int32_t latin1ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t count = mbsLen;
    int32_t i;

    /* The output capacity only matters when there is an output buffer. */
    if (wcsBuf != NULL && count > bufSizeInWideChar) {
        count = bufSizeInWideChar;
    }
    /* A negative length or capacity means there is nothing to convert. */
    if (count < 0) {
        count = 0;
    }

    if (wcsBuf != NULL) {
        for (i = 0; i < count; i++) {
            wcsBuf[i] = mbs[i];
        }
    }

    /* Report on every path so callers never read an unset value. */
    if (bytesConsumed) {
        *bytesConsumed = count;
    }

    return count;
}
238
/**
 * Encode one wide character as a single ISO 8859-1 (latin1) byte.
 *
 * Characters outside the latin1 range (0x100 and above) are replaced by '?'.
 * The byte is stored only when mbs is not NULL and bufSize is positive.
 *
 * @return Always 1, the size of a latin1 character in bytes.
 */
int32_t wcToLatin1(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    if (mbs != NULL && bufSize > 0) {
        *mbs = (wc < 0x100) ? (uint8_t)(wc & 0xff) : (uint8_t)'?';
    }

    return 1;
}
252
253#endif /* I18N_LATIN1_SUPPORT */
254
255#ifdef I18N_UTF8_UTF16_SUPPORT
256
257int32_t utf8ToWcs(const uint8_t *mbs, int32_t mbsLen,
258        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
259        int32_t *bytesConsumed)
260{
261    int32_t charsConverted = 0;
262    int32_t i = 0;
263    int32_t wideChar;
264
265    if (wcsBuf == NULL) {
266        /* No conversion but we're still going to calculate bytesConsumed */
267        bufSizeInWideChar = mbsLen * 2;
268    }
269
270    while((i < mbsLen) && (charsConverted < bufSizeInWideChar)) {
271        uint8_t ch = mbs[i];
272        uint8_t ch2, ch3, ch4;
273
274        wideChar = -1;
275
276        if(IS_ASCII(ch)) {
277            wideChar = ch;
278            i++;
279        } else if ((ch & 0xc0) == 0xc0) {
280            int utfStart = i;
281            if ((ch & 0xe0) == 0xc0) {
282                /* 2 byte sequence */
283                if (i + 1 < mbsLen && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80) {
284                    wideChar = (uint16_t)(((ch & 0x1F) << 6) | (ch2 & 0x3F));
285                    i += 2;
286                } else {
287                    /* skip incomplete sequence */
288                    i++;
289                }
290            } else if ((ch & 0xf0) == 0xe0) {
291                /* 3 byte sequence */
292                if (i + 2 < mbsLen
293                        && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80
294                        && ((ch3 = mbs[i + 2]) & 0xc0) == 0x80) {
295                    wideChar = (uint16_t)(((ch & 0x0F) << 12) | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F));
296                    i += 3;
297                } else {
298                    /* skip incomplete sequence (up to 2 bytes) */
299                    i++;
300                    if (i < mbsLen && (mbs[i] & 0xc0) == 0x80)
301                        i++;
302                }
303            } else if ((ch & 0xf8) == 0xf0) {
304                /* 4 byte sequence */
305                if (i + 3 < mbsLen
306                        && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80
307                        && ((ch3 = mbs[i + 2]) & 0xc0) == 0x80
308                        && ((ch4 = mbs[i + 3]) & 0xc0) == 0x80) {
309                    /* FIXME: we do NOT support U+10000 - U+10FFFF for now.
310                     *        leave it as 0xFFFD. */
311                    wideChar = INVALID_UNICODE;
312                    i += 4;
313                } else {
314                    /* skip incomplete sequence (up to 3 bytes) */
315                    i++;
316                    if (i < mbsLen && (mbs[i] & 0xc0) == 0x80) {
317                        i++;
318                        if (i < mbsLen && (mbs[i] & 0xc0) == 0x80) {
319                            i++;
320                        }
321                    }
322                }
323            } else {
324                /* invalid */
325                i++;
326            }
327            if (i >= mbsLen && wideChar == -1) {
328                /* Possible incomplete UTF-8 sequence at the end of mbs.
329                 * Leave it to the caller.
330                 */
331                i = utfStart;
332                break;
333            }
334        } else {
335            /* invalid */
336            i++;
337        }
338        if(wcsBuf) {
339            if (wideChar == -1)
340                wideChar = INVALID_UNICODE;
341            wcsBuf[charsConverted] = (uint16_t)wideChar;
342        }
343        charsConverted++;
344    }
345
346    if (bytesConsumed)
347        *bytesConsumed = i;
348
349    return charsConverted;
350}
351
/**
 * Encode one wide character as UTF-8.
 *
 * The encoding takes 1 byte up to 0x7f, 2 bytes up to 0x7ff and 3 bytes for
 * everything else.  The bytes are stored only when mbs is not NULL and
 * bufSize can hold the complete encoding; nothing is written otherwise.
 *
 * @return Number of bytes the encoding of wc needs, whether or not they
 *         were stored.
 */
int32_t wcToUtf8(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    int32_t needed;

    if (wc <= 0x7f) {
        needed = 1;
    } else if (wc <= 0x7ff) {
        needed = 2;
    } else {
        needed = 3;
    }

    /* Size query, or not enough room: report the size and store nothing. */
    if (mbs == NULL || bufSize < needed) {
        return needed;
    }

    switch (needed) {
    case 1:
        mbs[0] = (uint8_t)wc;
        break;
    case 2:
        mbs[0] = (uint8_t)((wc >> 6) | 0xc0);
        mbs[1] = (uint8_t)((wc & 0x3f) | 0x80);
        break;
    default:
        mbs[0] = (uint8_t)((wc >> 12) | 0xe0);
        mbs[1] = (uint8_t)(((wc >> 6) & 0x3f) | 0x80);
        mbs[2] = (uint8_t)((wc & 0x3f) | 0x80);
        break;
    }

    return needed;
}
374
/**
 * Convert UTF-16 BE bytes to 16-bit wide characters.
 *
 * Every two input bytes form one wide character, high byte first.  A
 * trailing odd byte is ignored.  Surrogate pairs are copied through as two
 * separate wide characters.
 *
 * @param mbs               Input bytes.
 * @param mbsLen            Number of bytes in mbs; a negative value is
 *                          treated as empty input.
 * @param wcsBuf            Output buffer, or NULL to only compute the
 *                          number of wide characters required.
 * @param bufSizeInWideChar Capacity of wcsBuf in wide characters; ignored
 *                          when wcsBuf is NULL.  A negative value is
 *                          treated as zero.
 * @param bytesConsumed     If not NULL, always receives the number of input
 *                          bytes accounted for by the return value.
 * @return Number of wide characters written, or required when wcsBuf is
 *         NULL.  Never negative.
 */
int32_t utf16beToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t units = mbsLen / 2;     /* whole 16-bit code units available */
    int32_t k;

    if (wcsBuf != NULL && units > bufSizeInWideChar) {
        units = bufSizeInWideChar;
    }
    /* Without this clamp a negative length or capacity would send the copy
     * loop far beyond both buffers. */
    if (units < 0) {
        units = 0;
    }

    if (wcsBuf != NULL) {
        for (k = 0; k < units; k++) {
            /* TODO: handle surrogate pair values */
            wcsBuf[k] = (uint16_t)((mbs[2 * k] << 8) | mbs[2 * k + 1]);
        }
    }

    /* Report on every path so callers never read an unset value. */
    if (bytesConsumed) {
        *bytesConsumed = units * 2;
    }

    return units;
}
398
/**
 * Encode one wide character as two UTF-16 BE bytes, high byte first.
 *
 * The bytes are stored only when mbs is not NULL and bufSize is at least 2.
 *
 * @return Always 2, the size of one UTF-16 code unit in bytes.
 */
int32_t wcToUtf16be(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    enum { UTF16_UNIT_BYTES = 2 };

    if (mbs != NULL && bufSize >= UTF16_UNIT_BYTES) {
        /* TODO: handle surrogate pair values */
        mbs[0] = (uint8_t)(wc >> 8);
        mbs[1] = (uint8_t)(wc & 0xff);
    }

    return UTF16_UNIT_BYTES;
}
408
/**
 * Convert UTF-16 LE bytes to 16-bit wide characters.
 *
 * Every two input bytes form one wide character, low byte first.  A
 * trailing odd byte is ignored.  Surrogate pairs are copied through as two
 * separate wide characters.
 *
 * @param mbs               Input bytes.
 * @param mbsLen            Number of bytes in mbs; a negative value is
 *                          treated as empty input.
 * @param wcsBuf            Output buffer, or NULL to only compute the
 *                          number of wide characters required.
 * @param bufSizeInWideChar Capacity of wcsBuf in wide characters; ignored
 *                          when wcsBuf is NULL.  A negative value is
 *                          treated as zero.
 * @param bytesConsumed     If not NULL, always receives the number of input
 *                          bytes accounted for by the return value.
 * @return Number of wide characters written, or required when wcsBuf is
 *         NULL.  Never negative.
 */
int32_t utf16leToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t units = mbsLen / 2;     /* whole 16-bit code units available */
    int32_t k;

    if (wcsBuf != NULL && units > bufSizeInWideChar) {
        units = bufSizeInWideChar;
    }
    /* Without this clamp a negative length or capacity would send the copy
     * loop far beyond both buffers. */
    if (units < 0) {
        units = 0;
    }

    if (wcsBuf != NULL) {
        for (k = 0; k < units; k++) {
            /* TODO: handle surrogate pair values */
            wcsBuf[k] = (uint16_t)(mbs[2 * k] | (mbs[2 * k + 1] << 8));
        }
    }

    /* Report on every path so callers never read an unset value. */
    if (bytesConsumed) {
        *bytesConsumed = units * 2;
    }

    return units;
}
432
/**
 * Encode one wide character as two UTF-16 LE bytes, low byte first.
 *
 * The bytes are stored only when mbs is not NULL and bufSize is at least 2.
 *
 * @return Always 2, the size of one UTF-16 code unit in bytes.
 */
int32_t wcToUtf16le(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    enum { UTF16_UNIT_BYTES = 2 };

    if (mbs != NULL && bufSize >= UTF16_UNIT_BYTES) {
        /* TODO: handle surrogate pair values */
        mbs[0] = (uint8_t)(wc & 0xff);
        mbs[1] = (uint8_t)(wc >> 8);
    }

    return UTF16_UNIT_BYTES;
}
442
443#endif /* I18N_UTF8_UTF16_SUPPORT */
444
445