/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi
17f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi#include "utils/utf8_utils.h"
18f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi
19f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi#include "utils/char_utils.h"
20f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi
21f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanaginamespace latinime {
22f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanaginamespace dicttoolkit {
23f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi
24f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4;
25f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8};
26f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0};
27f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x03};
28f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
29f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi
30f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F;
31f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80;
32f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagiconst size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6;
33f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi
34f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) {
35f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    std::vector<int> codePoints;
36f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    int remainingByteCountForCurrentCodePoint = 0;
37f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    int currentCodePointSequenceSize = 0;
38f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    int codePoint = 0;
39f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    for (const char c : utf8Str) {
40f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        if (remainingByteCountForCurrentCodePoint == 0) {
41f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(c);
42f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            if (currentCodePointSequenceSize <= 0) {
43f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi                AKLOGE("%x is an invalid utf8 first byte value.", c);
44f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi                return std::vector<int>();
45f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            }
46f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize;
47f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            codePoint = maskFirstByte(c, remainingByteCountForCurrentCodePoint);
48f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        } else {
49f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
50f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            codePoint += maskTrailingByte(c);
51f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        }
52f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        remainingByteCountForCurrentCodePoint--;
53f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        if (remainingByteCountForCurrentCodePoint == 0) {
54f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) {
55f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi                AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.",
56f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi                        currentCodePointSequenceSize,  codePoint);
57f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi                return std::vector<int>();
58f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            }
59f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            codePoints.push_back(codePoint);
60f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        }
61f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    }
62f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    return codePoints;
63f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi}
64f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi
65f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) {
66f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) {
67f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        if ((firstByte & FIRST_BYTE_MARKER_MASKS[i]) == FIRST_BYTE_MARKERS[i]) {
68f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            return i;
69f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        }
70f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    }
71f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    // Not a valid utf8 char first byte.
72f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    return -1;
73f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi}
74f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi
75f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte,
76f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        const int sequenceSize) {
77f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    return firstByte & FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize];
78f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi}
79f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi
80f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) {
81f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    return secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK;
82f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi}
83f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi
84f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) {
85f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    std::string utf8String;
86f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    for (const int codePoint : codePoints) {
87f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        const int sequenceSize = getSequenceSizeToEncodeCodePoint(codePoint);
88f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        if (sequenceSize <= 0) {
89f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            AKLOGE("Cannot encode code point (%d).", codePoint);
90f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            return std::string();
91f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        }
92f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        const int trailingByteCount = sequenceSize - 1;
93f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        // Output first byte.
94f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        const int value = codePoint >> (trailingByteCount * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE);
95f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        utf8String.push_back(static_cast<char>(value | FIRST_BYTE_MARKERS[sequenceSize]));
96f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        // Output second and later bytes.
97f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        for (int i = 1; i < sequenceSize; ++i) {
98f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            const int shiftAmount = (trailingByteCount - i) * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
99f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            const int value = (codePoint >> shiftAmount) & TRAILING_BYTE_CODE_POINT_BITS_MASK;
100f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            utf8String.push_back(static_cast<char>(value | TRAILING_BYTE_MARKER));
101f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        }
102f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    }
103f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    return utf8String;
104f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi}
105f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi
106f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi/* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) {
107f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    if (codePoint < 0) {
108f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        return -1;
109f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    }
110f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) {
111f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[i]) {
112f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi            return i;
113f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi        }
114f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    }
115f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi    return -1;
116f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi}
117f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi
118f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi} // namespace dicttoolkit
119f0c303dd02a5df8ad544b3971e7738cb34a1d6beKeisuke Kuroyanagi} // namespace latinime
120