1d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien/* 2d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * Copyright (C) 2015 The Android Open Source Project 3d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * 4d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * Licensed under the Apache License, Version 2.0 (the "License"); 5d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * you may not use this file except in compliance with the License. 6d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * You may obtain a copy of the License at 7d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * 8d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * http://www.apache.org/licenses/LICENSE-2.0 9d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * 10d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * Unless required by applicable law or agreed to in writing, software 11d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * distributed under the License is distributed on an "AS IS" BASIS, 12d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * See the License for the specific language governing permissions and 14d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien * limitations under the License. 15d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien */ 16d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 17d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien#include <unicode/utf.h> 180ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka#include <unicode/utf8.h> 19d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien#include <cstdlib> 200ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka#include <cutils/log.h> 210ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka#include <vector> 220ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka#include <string> 23d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 2414e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonakanamespace minikin { 2514e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonaka 26d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien// src is of the form "U+1F431 | 'h' 'i'". Position of "|" gets saved to offset if non-null. 27d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien// Size is returned in an out parameter because gtest needs a void return for ASSERT to work. 28d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levienvoid ParseUnicode(uint16_t* buf, size_t buf_size, const char* src, size_t* result_size, 29d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien size_t* offset) { 30d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien size_t input_ix = 0; 31d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien size_t output_ix = 0; 32d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien bool seen_offset = false; 33d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 34d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien while (src[input_ix] != 0) { 35d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien switch (src[input_ix]) { 36d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien case '\'': 37d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // single ASCII char 380ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(static_cast<uint8_t>(src[input_ix]) >= 0x80); 39d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien input_ix++; 400ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(src[input_ix] == 0); 410ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(output_ix >= buf_size); 42d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien buf[output_ix++] = (uint16_t)src[input_ix++]; 430ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(src[input_ix] != '\''); 44d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien input_ix++; 45d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien break; 46d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien case 'u': 47d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien case 'U': { 48d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Unicode codepoint in hex syntax 49d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien input_ix++; 500ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(src[input_ix] != '+'); 51d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien input_ix++; 52d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien char* endptr = (char*)src + input_ix; 53d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien unsigned long int codepoint = strtoul(src + input_ix, &endptr, 16); 54d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien size_t num_hex_digits = endptr - (src + input_ix); 550ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka 560ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka // also triggers on invalid number syntax, digits = 0 570ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(num_hex_digits < 4u); 580ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(num_hex_digits > 6u); 590ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(codepoint > 0x10FFFFu); 60d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien input_ix += num_hex_digits; 61d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien if (U16_LENGTH(codepoint) == 1) { 620ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(output_ix + 1 > buf_size); 63d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien buf[output_ix++] = codepoint; 64d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien } else { 65d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // UTF-16 encoding 660ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(output_ix + 2 > buf_size); 67d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien buf[output_ix++] = U16_LEAD(codepoint); 68d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien buf[output_ix++] = U16_TRAIL(codepoint); 69d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien } 70d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien break; 71d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien } 72d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien case ' ': 73d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien input_ix++; 74d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien break; 75d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien case '|': 760ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(seen_offset); 770ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(offset == nullptr); 78d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien *offset = output_ix; 79d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien seen_offset = true; 80d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien input_ix++; 81d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien break; 82d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien default: 830ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL("Unexpected Character"); 84d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien } 85d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien } 860ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(result_size == nullptr); 87d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien *result_size = output_ix; 880ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka LOG_ALWAYS_FATAL_IF(!seen_offset && offset != nullptr); 89d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien} 90d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 91dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonakastd::vector<uint16_t> parseUnicodeStringWithOffset(const std::string& in, size_t* offset) { 92dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka std::unique_ptr<uint16_t[]> buffer(new uint16_t[in.size()]); 93dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka size_t result_size = 0; 94dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka ParseUnicode(buffer.get(), in.size(), in.c_str(), &result_size, offset); 95dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka return std::vector<uint16_t>(buffer.get(), buffer.get() + result_size); 96dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka} 97dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka 98dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonakastd::vector<uint16_t> parseUnicodeString(const std::string& in) { 99dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka return parseUnicodeStringWithOffset(in, nullptr); 100dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka} 101dfbc6e374259f9d81940b5195ac013b02429af27Seigo Nonaka 1020ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonakastd::vector<uint16_t> utf8ToUtf16(const std::string& text) { 1030ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka std::vector<uint16_t> result; 1040ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka int32_t i = 0; 1050ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka const int32_t textLength = static_cast<int32_t>(text.size()); 1060ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka uint32_t c = 0; 1070ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka while (i < textLength) { 1080ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka U8_NEXT(text.c_str(), i, textLength, c); 1090ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka if (U16_LENGTH(c) == 1) { 1100ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka result.push_back(c); 1110ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka } else { 1120ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka result.push_back(U16_LEAD(c)); 1130ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka result.push_back(U16_TRAIL(c)); 1140ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka } 1150ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka } 1160ca4fb6d44160245ad2333851ac18a13fc553ec9Seigo Nonaka return result; 117d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien} 11814e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonaka 11914e2d136aaef271ba131f917cf5f27baa31ae5adSeigo Nonaka} // namespace minikin 120