1f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* 2f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * Copyright (C) 2014 The Android Open Source Project 3f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * 4f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * Licensed under the Apache License, Version 2.0 (the "License"); 5f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * you may not use this file except in compliance with the License. 6f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * You may obtain a copy of the License at 7f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * 8f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * http://www.apache.org/licenses/LICENSE-2.0 9f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * 10f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * Unless required by applicable law or agreed to in writing, software 11f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * distributed under the License is distributed on an "AS IS" BASIS, 12f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * See the License for the specific language governing permissions and 14f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi * limitations under the License. 15f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi */ 16f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 17f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi#include "utils/utf8_utils.h" 18f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 19f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi#include "utils/char_utils.h" 20f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 21f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanaginamespace latinime { 22f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanaginamespace dicttoolkit { 23f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 24f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4; 25f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8}; 26f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0}; 27f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x03}; 28f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; 29f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 30f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F; 31f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80; 32f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6; 33f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 34f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) { 35f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi std::vector<int> codePoints; 36f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi int remainingByteCountForCurrentCodePoint = 0; 37f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi int currentCodePointSequenceSize = 0; 38f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi int codePoint = 0; 39f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi for (const char c : utf8Str) { 40f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi if (remainingByteCountForCurrentCodePoint == 0) { 41f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(c); 42f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi if (currentCodePointSequenceSize <= 0) { 43f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi AKLOGE("%x is an invalid utf8 first byte value.", c); 44f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi return std::vector<int>(); 45f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } 46f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize; 47f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi codePoint = maskFirstByte(c, remainingByteCountForCurrentCodePoint); 48f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } else { 49f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; 50f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi codePoint += maskTrailingByte(c); 51f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } 52f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi remainingByteCountForCurrentCodePoint--; 53f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi if (remainingByteCountForCurrentCodePoint == 0) { 54f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) { 55f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.", 56f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi currentCodePointSequenceSize, codePoint); 57f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi return std::vector<int>(); 58f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } 59f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi codePoints.push_back(codePoint); 60f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } 61f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } 62f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi return codePoints; 63f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi} 64f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 65f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) { 66f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) { 67f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi if ((firstByte & FIRST_BYTE_MARKER_MASKS[i]) == FIRST_BYTE_MARKERS[i]) { 68f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi return i; 69f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } 70f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } 71f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi // Not a valid utf8 char first byte. 72f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi return -1; 73f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi} 74f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 75f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte, 76f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi const int sequenceSize) { 77f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi return firstByte & FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize]; 78f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi} 79f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 80f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) { 81f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi return secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK; 82f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi} 83f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 84f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) { 85f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi std::string utf8String; 86f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi for (const int codePoint : codePoints) { 87f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi const int sequenceSize = getSequenceSizeToEncodeCodePoint(codePoint); 88f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi if (sequenceSize <= 0) { 89f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi AKLOGE("Cannot encode code point (%d).", codePoint); 90f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi return std::string(); 91f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } 92f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi const int trailingByteCount = sequenceSize - 1; 93f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi // Output first byte. 94f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi const int value = codePoint >> (trailingByteCount * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE); 95f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi utf8String.push_back(static_cast<char>(value | FIRST_BYTE_MARKERS[sequenceSize])); 96f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi // Output second and later bytes. 97f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi for (int i = 1; i < sequenceSize; ++i) { 98f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi const int shiftAmount = (trailingByteCount - i) * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; 99f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi const int value = (codePoint >> shiftAmount) & TRAILING_BYTE_CODE_POINT_BITS_MASK; 100f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi utf8String.push_back(static_cast<char>(value | TRAILING_BYTE_MARKER)); 101f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } 102f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } 103f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi return utf8String; 104f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi} 105f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 106f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) { 107f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi if (codePoint < 0) { 108f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi return -1; 109f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } 110f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) { 111f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[i]) { 112f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi return i; 113f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } 114f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi } 115f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi return -1; 116f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi} 117f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi 118f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi} // namespace dicttoolkit 119f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi} // namespace latinime 120