src/objmng/drm_i18n.c

/*
 * Copyright (C) 2007 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <objmng/drm_i18n.h>

/*
 * Inclusive byte-range predicates. The macro names indicate the charset
 * (GB2312, GBK, BIG5) and whether the high or the low byte of a double-byte
 * character is being tested.
 *
 * Every macro evaluates its argument more than once, so pass an expression
 * without side effects.
 */
#define IS_GB2312_HIGH_BYTE(c)  ((c) >= 0xA1 && (c) <= 0xF7)
#define IS_GB2312_LOW_BYTE(c)   ((c) >= 0xA1 && (c) <= 0xFE)
#define IS_GBK_HIGH_BYTE(c)     ((c) >= 0x81 && (c) <= 0xFE)
/* 0x7F is excluded from the GBK low-byte range. */
#define IS_GBK_LOW_BYTE(c)      ((c) >= 0x40 && (c) <= 0xFE && (c) != 0x7F)
#define IS_BIG5_HIGH_BYTE(c)    ((c) >= 0xA1 && (c) <= 0xF9)
/* The BIG5 low byte has two disjoint ranges: 0x40..0x7E and 0xA1..0xFE. */
#define IS_BIG5_LOW_BYTE(c)     (((c) >= 0x40 && (c) <= 0x7E) \
                                 || ((c) >= 0xA1 && (c) <= 0xFE))
/* Only an upper bound is checked, so c must be an unsigned/non-negative value. */
#define IS_ASCII(c)             ((c) <= 127)

/* Value stored by utf8ToWcs() in place of input it cannot decode. */
#define INVALID_UNICODE         0xFFFD

/*
 * Compile-time feature switches. The #ifdef blocks in this file use them to
 * include the latin1 and the UTF-8/UTF-16 converters and their dispatch cases.
 */
#define I18N_LATIN1_SUPPORT
#define I18N_UTF8_UTF16_SUPPORT


/**
 * Convert ISO 8859-1 (latin1) bytes to unicode: every input byte becomes one
 * 16-bit character with the same value. Returns the number of characters
 * converted; when wcsBuf is NULL it returns mbsLen without converting.
 */
static int32_t latin1ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one unicode char to an ISO 8859-1 (latin1) byte.
 * Values of 0x100 and above are replaced by '?'. Always returns 1; the byte
 * is written only when mbs is non-NULL and bufSize is positive.
 */
static int32_t wcToLatin1(uint16_t wc, uint8_t * mbs, int32_t bufSize);

/**
 * Convert UTF-8 to unicode (16-bit characters).
 * Input that cannot be decoded, including 4-byte sequences, is stored as
 * INVALID_UNICODE. An incomplete sequence at the very end of the input is
 * left unconsumed for the caller. Returns the number of characters produced.
 */
static int32_t utf8ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one unicode char to UTF-8 bytes.
 * Returns the encoded length (1, 2 or 3); the bytes are written only when mbs
 * is non-NULL and bufSize is at least that length.
 */
static int32_t wcToUtf8(uint16_t wc, uint8_t * mbs, int32_t bufSize);

/**
 * Convert UTF-16 BE to unicode: each pair of input bytes (high byte first)
 * becomes one 16-bit character. Surrogate pairs get no special handling.
 */
static int32_t utf16beToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one unicode char to UTF-16 BE bytes (high byte first).
 * Always returns 2; the bytes are written only when mbs is non-NULL and
 * bufSize is at least 2.
 */
static int32_t wcToUtf16be(uint16_t wc, uint8_t * mbs, int32_t bufSize);

/**
 * Convert UTF-16 LE to unicode: each pair of input bytes (low byte first)
 * becomes one 16-bit character. Surrogate pairs get no special handling.
 */
static int32_t utf16leToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one unicode char to UTF-16 LE bytes (low byte first).
 * Always returns 2; the bytes are written only when mbs is non-NULL and
 * bufSize is at least 2.
 */
static int32_t wcToUtf16le(uint16_t wc, uint8_t * mbs, int32_t bufSize);

/*
 * Route a multibyte-to-unicode conversion to the decoder compiled in for
 * `charset`. See drm_i18n.h for the public contract.
 *
 * All arguments other than `charset` are forwarded untouched. The result is
 * whatever the selected decoder returns, or -1 when no decoder for the
 * charset is compiled in.
 */
int32_t DRM_i18n_mbsToWcs(DRM_Charset_t charset,
        const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    /* Stays -1 unless one of the compiled-in cases below matches. */
    int32_t converted = -1;

    switch (charset)
    {
#ifdef I18N_GB2312_SUPPORT
        case DRM_CHARSET_GB2312:
            converted = gb2312ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
            break;
#endif
#ifdef I18N_GBK_SUPPORT
        case DRM_CHARSET_GBK:
            converted = gbkToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
            break;
#endif
#ifdef I18N_BIG5_SUPPORT
        case DRM_CHARSET_BIG5:
            converted = big5ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
            break;
#endif
#ifdef I18N_LATIN1_SUPPORT
        case DRM_CHARSET_LATIN1:
            converted = latin1ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
            break;
#endif
#ifdef I18N_ISO8859X_SUPPORT
        case DRM_CHARSET_LATIN2:
        case DRM_CHARSET_LATIN3:
        case DRM_CHARSET_LATIN4:
        case DRM_CHARSET_CYRILLIC:
        case DRM_CHARSET_ARABIC:
        case DRM_CHARSET_GREEK:
        case DRM_CHARSET_HEBREW:
        case DRM_CHARSET_LATIN5:
        case DRM_CHARSET_LATIN6:
        case DRM_CHARSET_THAI:
        case DRM_CHARSET_LATIN7:
        case DRM_CHARSET_LATIN8:
        case DRM_CHARSET_LATIN9:
        case DRM_CHARSET_LATIN10:
            converted = iso8859xToWcs(charset, mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
            break;
#endif
#ifdef I18N_UTF8_UTF16_SUPPORT
        case DRM_CHARSET_UTF8:
            converted = utf8ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
            break;
        case DRM_CHARSET_UTF16BE:
            converted = utf16beToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
            break;
        case DRM_CHARSET_UTF16LE:
            converted = utf16leToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
            break;
#endif
        default:
            break;
    }

    return converted;
}

/*
 * Encode a unicode string into the multibyte form of `charset`.
 * See drm_i18n.h for the public contract.
 *
 * With a NULL mbsBuf nothing is written and the return value is the total
 * number of bytes the whole input would need. Otherwise characters are
 * encoded one at a time until the input is exhausted or the next character
 * no longer fits; the return value is the number of bytes written.
 * Returns -1 when no encoder for the charset is compiled in.
 */
int32_t DRM_i18n_wcsToMbs(DRM_Charset_t charset,
        const uint16_t *wcs, int32_t wcsLen,
        uint8_t *mbsBuf, int32_t bufSizeInByte)
{
    int32_t (* encodeOne)(uint16_t, uint8_t *, int32_t);
    int32_t bytesOut = 0;
    int32_t idx;

    /* Pick the per-character encoder for the requested charset. */
    switch (charset)
    {
#ifdef I18N_LATIN1_SUPPORT
        case DRM_CHARSET_LATIN1:
            encodeOne = wcToLatin1;
            break;
#endif
#ifdef I18N_UTF8_UTF16_SUPPORT
        case DRM_CHARSET_UTF8:
            encodeOne = wcToUtf8;
            break;
        case DRM_CHARSET_UTF16BE:
            encodeOne = wcToUtf16be;
            break;
        case DRM_CHARSET_UTF16LE:
            encodeOne = wcToUtf16le;
            break;
#endif
#ifdef I18N_ISO8859X_SUPPORT
        case DRM_CHARSET_LATIN2:
        case DRM_CHARSET_LATIN3:
        case DRM_CHARSET_LATIN4:
        case DRM_CHARSET_CYRILLIC:
        case DRM_CHARSET_ARABIC:
        case DRM_CHARSET_GREEK:
        case DRM_CHARSET_HEBREW:
        case DRM_CHARSET_LATIN5:
        case DRM_CHARSET_LATIN6:
        case DRM_CHARSET_THAI:
        case DRM_CHARSET_LATIN7:
        case DRM_CHARSET_LATIN8:
        case DRM_CHARSET_LATIN9:
        case DRM_CHARSET_LATIN10:
            return wcsToIso8859x(charset, wcs, wcsLen, mbsBuf, bufSizeInByte);
#endif
        default:
            return -1;
    }

    if (mbsBuf == NULL) {
        /* Counting mode: add up the encoded length of every character. */
        for (idx = 0; idx < wcsLen; idx++) {
            /* TODO: handle surrogate pair values here */
            bytesOut += encodeOne(wcs[idx], NULL, 0);
        }
        return bytesOut;
    }

    idx = 0;
    while (idx < wcsLen && bytesOut < bufSizeInByte) {
        const int32_t room = bufSizeInByte - bytesOut;
        /* TODO: handle surrogate pair values here */
        const int32_t need = encodeOne(wcs[idx], mbsBuf + bytesOut, room);

        if (need > room) {
            /* Insufficient buffer: the encoder wrote nothing, stop here. */
            break;
        }
        bytesOut += need;
        idx++;
    }

    return bytesOut;
}


#ifdef I18N_LATIN1_SUPPORT

/**
 * Convert ISO 8859-1 (latin1) bytes to 16-bit unicode characters.
 * Each input byte maps to the unicode character with the same value.
 *
 * @param mbs               Latin1 input bytes.
 * @param mbsLen            Number of bytes available in mbs.
 * @param wcsBuf            Output buffer, or NULL to query the required size.
 * @param bufSizeInWideChar Capacity of wcsBuf in 16-bit units.
 * @param bytesConsumed     Optional; receives the number of input bytes
 *                          converted. Not written when wcsBuf is NULL.
 * @return mbsLen when wcsBuf is NULL; otherwise the number of characters
 *         stored, which is min(mbsLen, bufSizeInWideChar) and never negative.
 */
int32_t latin1ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t charsToConvert;
    int32_t i;

    if (wcsBuf == NULL) {
        /* Size query: one wide char per input byte. */
        return mbsLen;
    }

    charsToConvert = mbsLen > bufSizeInWideChar ? bufSizeInWideChar : mbsLen;
    if (charsToConvert < 0) {
        /* A negative length or capacity means "nothing to convert". Fall
         * through so *bytesConsumed is still reported as 0 instead of being
         * left unset for the caller. */
        charsToConvert = 0;
    }

    for (i = 0; i < charsToConvert; i++) {
        wcsBuf[i] = mbs[i];
    }

    if (bytesConsumed)
        *bytesConsumed = charsToConvert;

    return charsToConvert;
}

/**
 * Encode one unicode character as a single ISO 8859-1 (latin1) byte.
 *
 * @param wc      Character to encode; values above 0xFF are replaced by '?'.
 * @param mbs     Destination, may be NULL.
 * @param bufSize Space available at mbs, in bytes.
 * @return Always 1. The byte is stored only when mbs is non-NULL and
 *         bufSize is at least 1.
 */
int32_t wcToLatin1(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    /* Latin1 covers U+0000..U+00FF only; anything else becomes '?'. */
    const uint8_t encoded = (wc > 0xff) ? (uint8_t)'?' : (uint8_t)wc;

    if (mbs != NULL && bufSize >= 1) {
        mbs[0] = encoded;
    }

    return 1;
}

#endif /* I18N_LATIN1_SUPPORT */

#ifdef I18N_UTF8_UTF16_SUPPORT

/**
 * Convert UTF-8 bytes to 16-bit unicode characters.
 *
 * Decoding rules:
 *  - 1-, 2- and 3-byte sequences are decoded to one 16-bit value each.
 *  - A complete 4-byte sequence (U+10000 and above) is not supported and
 *    yields one INVALID_UNICODE.
 *  - A stray continuation byte, a lead byte 0xF8..0xFF, or a sequence that is
 *    interrupted by a non-continuation byte yields one INVALID_UNICODE.
 *  - A 2-, 3- or 4-byte sequence that is cut short by the END of the input is
 *    not consumed, so the caller can retry once more bytes are available.
 *
 * NOTE(review): overlong encodings and encoded surrogates are decoded as-is,
 * not rejected.
 *
 * @param mbs               UTF-8 input bytes.
 * @param mbsLen            Number of bytes available in mbs.
 * @param wcsBuf            Output buffer, or NULL to only count characters.
 * @param bufSizeInWideChar Capacity of wcsBuf in 16-bit units (ignored when
 *                          wcsBuf is NULL).
 * @param bytesConsumed     Optional; receives the number of input bytes
 *                          consumed.
 * @return Number of characters stored, or the number that would be stored
 *         when wcsBuf is NULL.
 */
int32_t utf8ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t charsConverted = 0;
    int32_t i = 0;
    int32_t wideChar;

    if (wcsBuf == NULL) {
        /* No conversion but we're still going to calculate bytesConsumed.
         * Every character produced consumes at least one input byte, so
         * mbsLen is already an upper bound on the output; doubling it would
         * only risk signed overflow for large inputs. */
        bufSizeInWideChar = mbsLen;
    }

    while ((i < mbsLen) && (charsConverted < bufSizeInWideChar)) {
        uint8_t ch = mbs[i];
        uint8_t ch2, ch3, ch4;

        wideChar = -1;

        if (IS_ASCII(ch)) {
            wideChar = ch;
            i++;
        } else if ((ch & 0xc0) == 0xc0) {
            int32_t utfStart = i;
            /* Cleared for lead bytes that can never begin a sequence, so
             * they are not mistaken for a truncated one below. */
            int maybeTruncated = 1;

            if ((ch & 0xe0) == 0xc0) {
                /* 2 byte sequence */
                if (i + 1 < mbsLen && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80) {
                    wideChar = (uint16_t)(((ch & 0x1F) << 6) | (ch2 & 0x3F));
                    i += 2;
                } else {
                    /* skip incomplete sequence */
                    i++;
                }
            } else if ((ch & 0xf0) == 0xe0) {
                /* 3 byte sequence */
                if (i + 2 < mbsLen
                        && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80
                        && ((ch3 = mbs[i + 2]) & 0xc0) == 0x80) {
                    wideChar = (uint16_t)(((ch & 0x0F) << 12) | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F));
                    i += 3;
                } else {
                    /* skip incomplete sequence (up to 2 bytes) */
                    i++;
                    if (i < mbsLen && (mbs[i] & 0xc0) == 0x80)
                        i++;
                }
            } else if ((ch & 0xf8) == 0xf0) {
                /* 4 byte sequence */
                if (i + 3 < mbsLen
                        && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80
                        && ((ch3 = mbs[i + 2]) & 0xc0) == 0x80
                        && ((ch4 = mbs[i + 3]) & 0xc0) == 0x80) {
                    /* FIXME: we do NOT support U+10000 - U+10FFFF for now.
                     *        leave it as 0xFFFD. */
                    wideChar = INVALID_UNICODE;
                    i += 4;
                } else {
                    /* skip incomplete sequence (up to 3 bytes) */
                    i++;
                    if (i < mbsLen && (mbs[i] & 0xc0) == 0x80) {
                        i++;
                        if (i < mbsLen && (mbs[i] & 0xc0) == 0x80) {
                            i++;
                        }
                    }
                }
            } else {
                /* 0xF8..0xFF never start a UTF-8 sequence. More input cannot
                 * make this byte valid, so consume it as invalid even when it
                 * is the last byte; otherwise the caller would be handed the
                 * same unconsumable byte forever. */
                i++;
                maybeTruncated = 0;
            }
            if (maybeTruncated && i >= mbsLen && wideChar == -1) {
                /* Possible incomplete UTF-8 sequence at the end of mbs.
                 * Leave it to the caller.
                 */
                i = utfStart;
                break;
            }
        } else {
            /* invalid: continuation byte without a lead byte */
            i++;
        }
        if (wcsBuf) {
            if (wideChar == -1)
                wideChar = INVALID_UNICODE;
            wcsBuf[charsConverted] = (uint16_t)wideChar;
        }
        charsConverted++;
    }

    if (bytesConsumed)
        *bytesConsumed = i;

    return charsConverted;
}

/**
 * Encode one 16-bit unicode character as UTF-8.
 *
 * @param wc      Character to encode. Surrogate values are encoded like any
 *                other 16-bit value (no pairing is done here).
 * @param mbs     Destination, may be NULL.
 * @param bufSize Space available at mbs, in bytes.
 * @return Encoded length: 1 for wc <= 0x7F, 2 for wc <= 0x7FF, otherwise 3.
 *         Bytes are stored only when mbs is non-NULL and bufSize is at least
 *         that length; a too-small buffer is left untouched.
 */
int32_t wcToUtf8(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    int32_t needed;

    if (wc < 0x80) {
        needed = 1;
    } else if (wc < 0x800) {
        needed = 2;
    } else {
        needed = 3;
    }

    /* Report the length even when nothing can be written. */
    if (mbs == NULL || bufSize < needed) {
        return needed;
    }

    switch (needed) {
        case 1:
            mbs[0] = (uint8_t)wc;
            break;
        case 2:
            mbs[0] = (uint8_t)(0xc0 | (wc >> 6));
            mbs[1] = (uint8_t)(0x80 | (wc & 0x3f));
            break;
        default:
            mbs[0] = (uint8_t)(0xe0 | (wc >> 12));
            mbs[1] = (uint8_t)(0x80 | ((wc >> 6) & 0x3f));
            mbs[2] = (uint8_t)(0x80 | (wc & 0x3f));
            break;
    }

    return needed;
}

/**
 * Convert UTF-16 big-endian bytes to 16-bit unicode characters.
 * Each pair of input bytes (high byte first) becomes one character; a
 * trailing odd byte is ignored.
 *
 * @param mbs               UTF-16 BE input bytes.
 * @param mbsLen            Number of bytes available in mbs.
 * @param wcsBuf            Output buffer, or NULL to query the required size.
 * @param bufSizeInWideChar Capacity of wcsBuf in 16-bit units.
 * @param bytesConsumed     Optional; receives the number of input bytes
 *                          converted. Not written when wcsBuf is NULL.
 * @return mbsLen / 2 when wcsBuf is NULL; otherwise the number of characters
 *         stored, which is never negative.
 */
int32_t utf16beToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t charsToConvert;
    int32_t i;

    if (wcsBuf == NULL) {
        return mbsLen / 2;
    }

    charsToConvert = (mbsLen / 2) > bufSizeInWideChar ? bufSizeInWideChar : (mbsLen / 2);
    if (charsToConvert < 0) {
        /* A negative length or capacity must convert nothing. Without this
         * clamp a count-down loop would start below zero and run past both
         * buffers. */
        charsToConvert = 0;
    }

    for (i = 0; i < charsToConvert; i++) {
        /* TODO: handle surrogate pair values */
        wcsBuf[i] = (uint16_t)((mbs[2 * i] << 8) | mbs[2 * i + 1]);
    }

    if (bytesConsumed)
        *bytesConsumed = charsToConvert * 2;

    return charsToConvert;
}

/**
 * Encode one 16-bit unicode character as two UTF-16 big-endian bytes.
 *
 * @param wc      Character to encode.
 * @param mbs     Destination, may be NULL.
 * @param bufSize Space available at mbs, in bytes.
 * @return Always 2. Bytes are stored (high byte first) only when mbs is
 *         non-NULL and bufSize is at least 2.
 */
int32_t wcToUtf16be(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    /* TODO: handle surrogate pair values */
    const uint8_t hi = (uint8_t)(wc >> 8);
    const uint8_t lo = (uint8_t)(wc & 0xff);

    if (mbs == NULL || bufSize < 2) {
        return 2;
    }

    mbs[0] = hi;
    mbs[1] = lo;
    return 2;
}

/**
 * Convert UTF-16 little-endian bytes to 16-bit unicode characters.
 * Each pair of input bytes (low byte first) becomes one character; a
 * trailing odd byte is ignored.
 *
 * @param mbs               UTF-16 LE input bytes.
 * @param mbsLen            Number of bytes available in mbs.
 * @param wcsBuf            Output buffer, or NULL to query the required size.
 * @param bufSizeInWideChar Capacity of wcsBuf in 16-bit units.
 * @param bytesConsumed     Optional; receives the number of input bytes
 *                          converted. Not written when wcsBuf is NULL.
 * @return mbsLen / 2 when wcsBuf is NULL; otherwise the number of characters
 *         stored, which is never negative.
 */
int32_t utf16leToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t charsToConvert;
    int32_t i;

    if (wcsBuf == NULL) {
        return mbsLen / 2;
    }

    charsToConvert = (mbsLen / 2) > bufSizeInWideChar ? bufSizeInWideChar : (mbsLen / 2);
    if (charsToConvert < 0) {
        /* A negative length or capacity must convert nothing. Without this
         * clamp a count-down loop would start below zero and run past both
         * buffers. */
        charsToConvert = 0;
    }

    for (i = 0; i < charsToConvert; i++) {
        /* TODO: handle surrogate pair values */
        wcsBuf[i] = (uint16_t)(mbs[2 * i] | (mbs[2 * i + 1] << 8));
    }

    if (bytesConsumed)
        *bytesConsumed = charsToConvert * 2;

    return charsToConvert;
}

/**
 * Encode one 16-bit unicode character as two UTF-16 little-endian bytes.
 *
 * @param wc      Character to encode.
 * @param mbs     Destination, may be NULL.
 * @param bufSize Space available at mbs, in bytes.
 * @return Always 2. Bytes are stored (low byte first) only when mbs is
 *         non-NULL and bufSize is at least 2.
 */
int32_t wcToUtf16le(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    /* TODO: handle surrogate pair values */
    const uint8_t lo = (uint8_t)(wc & 0xff);
    const uint8_t hi = (uint8_t)(wc >> 8);

    if (mbs == NULL || bufSize < 2) {
        return 2;
    }

    mbs[0] = lo;
    mbs[1] = hi;
    return 2;
}

#endif /* I18N_UTF8_UTF16_SUPPORT */