/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien
17d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien#include <unicode/utf.h>
180ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka#include <unicode/utf8.h>
19d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien#include <cstdlib>
200ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka#include <cutils/log.h>
210ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka#include <vector>
220ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka#include <string>
23d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien
2414e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonakanamespace minikin {
2514e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonaka
26d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien// src is of the form "U+1F431 | 'h' 'i'". Position of "|" gets saved to offset if non-null.
27d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien// Size is returned in an out parameter because gtest needs a void return for ASSERT to work.
28d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levienvoid ParseUnicode(uint16_t* buf, size_t buf_size, const char* src, size_t* result_size,
29d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        size_t* offset) {
30d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    size_t input_ix = 0;
31d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    size_t output_ix = 0;
32d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    bool seen_offset = false;
33d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien
34d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    while (src[input_ix] != 0) {
35d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        switch (src[input_ix]) {
36d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        case '\'':
37d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            // single ASCII char
380ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            LOG_ALWAYS_FATAL_IF(static_cast<uint8_t>(src[input_ix]) >= 0x80);
39d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix++;
400ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            LOG_ALWAYS_FATAL_IF(src[input_ix] == 0);
410ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            LOG_ALWAYS_FATAL_IF(output_ix >= buf_size);
42d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            buf[output_ix++] = (uint16_t)src[input_ix++];
430ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            LOG_ALWAYS_FATAL_IF(src[input_ix] != '\'');
44d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix++;
45d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            break;
46d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        case 'u':
47d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        case 'U': {
48d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            // Unicode codepoint in hex syntax
49d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix++;
500ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            LOG_ALWAYS_FATAL_IF(src[input_ix] != '+');
51d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix++;
52d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            char* endptr = (char*)src + input_ix;
53d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            unsigned long int codepoint = strtoul(src + input_ix, &endptr, 16);
54d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            size_t num_hex_digits = endptr - (src + input_ix);
550ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka
560ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            // also triggers on invalid number syntax, digits = 0
570ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            LOG_ALWAYS_FATAL_IF(num_hex_digits < 4u);
580ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            LOG_ALWAYS_FATAL_IF(num_hex_digits > 6u);
590ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            LOG_ALWAYS_FATAL_IF(codepoint > 0x10FFFFu);
60d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix += num_hex_digits;
61d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            if (U16_LENGTH(codepoint) == 1) {
620ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka                LOG_ALWAYS_FATAL_IF(output_ix + 1 > buf_size);
63d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien                buf[output_ix++] = codepoint;
64d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            } else {
65d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien                // UTF-16 encoding
660ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka                LOG_ALWAYS_FATAL_IF(output_ix + 2 > buf_size);
67d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien                buf[output_ix++] = U16_LEAD(codepoint);
68d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien                buf[output_ix++] = U16_TRAIL(codepoint);
69d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            }
70d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            break;
71d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        }
72d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        case ' ':
73d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix++;
74d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            break;
75d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        case '|':
760ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            LOG_ALWAYS_FATAL_IF(seen_offset);
770ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            LOG_ALWAYS_FATAL_IF(offset == nullptr);
78d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            *offset = output_ix;
79d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            seen_offset = true;
80d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix++;
81d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            break;
82d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        default:
830ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            LOG_ALWAYS_FATAL("Unexpected Character");
84d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        }
85d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    }
860ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka    LOG_ALWAYS_FATAL_IF(result_size == nullptr);
87d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    *result_size = output_ix;
880ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka    LOG_ALWAYS_FATAL_IF(!seen_offset && offset != nullptr);
89d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien}
90d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien
91dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonakastd::vector<uint16_t> parseUnicodeStringWithOffset(const std::string& in, size_t* offset) {
92dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka    std::unique_ptr<uint16_t[]> buffer(new uint16_t[in.size()]);
93dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka    size_t result_size = 0;
94dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka    ParseUnicode(buffer.get(), in.size(), in.c_str(), &result_size, offset);
95dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka    return std::vector<uint16_t>(buffer.get(), buffer.get() + result_size);
96dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka}
97dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka
98dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonakastd::vector<uint16_t> parseUnicodeString(const std::string& in) {
99dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka    return parseUnicodeStringWithOffset(in, nullptr);
100dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka}
101dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka
1020ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonakastd::vector<uint16_t> utf8ToUtf16(const std::string& text) {
1030ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka    std::vector<uint16_t> result;
1040ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka    int32_t i = 0;
1050ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka    const int32_t textLength = static_cast<int32_t>(text.size());
1060ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka    uint32_t c = 0;
1070ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka    while (i < textLength) {
1080ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka        U8_NEXT(text.c_str(), i, textLength, c);
1090ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka        if (U16_LENGTH(c) == 1) {
1100ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            result.push_back(c);
1110ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka        } else {
1120ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            result.push_back(U16_LEAD(c));
1130ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka            result.push_back(U16_TRAIL(c));
1140ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka        }
1150ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka    }
1160ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka    return result;
117d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien}
11814e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonaka
11914e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonaka}  // namespace minikin
120