1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define LOG_TAG "Minikin"
18
19#include "FontLanguage.h"
20
21#include <algorithm>
22#include <hb.h>
23#include <string.h>
24#include <unicode/uloc.h>
25
26namespace minikin {
27
// Packs four 8-bit characters into one 32-bit script tag, with c1 in the most significant byte
// and c4 in the least significant byte. Every argument is masked to its low 8 bits, the same way
// HarfBuzz's HB_TAG does, so that a negative (signed) char cannot sign-extend and overwrite the
// bytes that belong to the other characters. The result is a constant expression whenever the
// arguments are, so it can be used in case labels and static_assert.
#define SCRIPT_TAG(c1, c2, c3, c4) \
        ((static_cast<uint32_t>(c1) & 0xFFu) << 24 | \
         (static_cast<uint32_t>(c2) & 0xFFu) << 16 | \
         (static_cast<uint32_t>(c3) & 0xFFu) <<  8 | \
         (static_cast<uint32_t>(c4) & 0xFFu))
31
// Returns true when the first |subtagLen| characters of |buf| equal |subtag| and that match is a
// complete subtag, i.e. it is followed by the end of the buffer, a NUL, '-' or '_'.
// |bufLen| is the number of readable characters in |buf|.
static bool isEmojiSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) {
    const bool startsWithSubtag = bufLen >= subtagLen && strncmp(buf, subtag, subtagLen) == 0;
    if (!startsWithSubtag) {
        return false;
    }
    if (bufLen == subtagLen) {
        return true;  // The subtag runs up to the very end of the buffer.
    }
    // Otherwise the character right after the match must terminate the subtag.
    const char following = buf[subtagLen];
    return following == '\0' || following == '-' || following == '_';
}
43
// Encodes a two- or three-character language or region code into the low 15 bits of a 16-bit
// integer; the highest bit is always 0. Each character is stored as a 5-bit offset from a base
// character. Five bits are enough because a language code uses lowercase letters (26 values) and
// a region code uses either uppercase letters (26 values) or, in the three-character form,
// digits (10 values).
//
// A two-character code is marked by setting the first 5-bit slot to all ones (0x1f) and is
// decoded relative to |twoLetterBase|. Any other |length| is treated as a three-character code
// relative to |threeLetterBase|, so the caller must have validated the length beforehand.
static uint16_t packLanguageOrRegion(const char* c, size_t length, uint8_t twoLetterBase,
        uint8_t threeLetterBase) {
    if (length != 2) {
        const uint16_t high = static_cast<uint16_t>(c[0] - threeLetterBase);
        const uint16_t middle = static_cast<uint16_t>(c[1] - threeLetterBase);
        const uint16_t low = static_cast<uint16_t>(c[2] - threeLetterBase);
        return static_cast<uint16_t>((high << 10) | (middle << 5) | low);
    }

    const uint16_t middle = static_cast<uint16_t>(c[0] - twoLetterBase);
    const uint16_t low = static_cast<uint16_t>(c[1] - twoLetterBase);
    // 0x7c00 == 0x1f << 10: the two-character marker in the first slot.
    return static_cast<uint16_t>(0x7c00u | (middle << 5) | low);
}
63
// Decodes a value produced by packLanguageOrRegion() back into characters written to |out|.
// Returns the number of characters written (2 or 3); no NUL terminator is appended.
// A first 5-bit slot of 0x1f marks a two-character code, decoded relative to |twoLetterBase|;
// anything else is a three-character code decoded relative to |threeLetterBase|.
static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase,
        uint8_t threeLetterBase) {
    const uint8_t highSlot = static_cast<uint8_t>((in >> 10) & 0x1f);
    const uint8_t middleSlot = static_cast<uint8_t>((in >> 5) & 0x1f);
    const uint8_t lowSlot = static_cast<uint8_t>(in & 0x1f);

    const bool isTwoCharacterCode = (highSlot == 0x1f);
    if (!isTwoCharacterCode) {
        out[0] = highSlot + threeLetterBase;
        out[1] = middleSlot + threeLetterBase;
        out[2] = lowSlot + threeLetterBase;
        return 3;
    }

    out[0] = middleSlot + twoLetterBase;
    out[1] = lowSlot + twoLetterBase;
    return 2;
}
81
// Scans |buffer| forward from |startOffset| and returns the index of the first '-' or '_'.
// Returns |bufferLength| when there is no delimiter in [startOffset, bufferLength), which
// includes the case where |startOffset| is already at or past the end.
static size_t nextDelimiterIndex(const char* buffer, size_t bufferLength, size_t startOffset) {
    size_t pos = startOffset;
    while (pos < bufferLength && buffer[pos] != '-' && buffer[pos] != '_') {
        ++pos;
    }
    return pos < bufferLength ? pos : bufferLength;
}
91
// Returns true if |c| is in the range 'a'..'z'.
static inline bool isLowercase(char c) {
    return c >= 'a' && c <= 'z';
}
95
// Returns true if |c| is in the range 'A'..'Z'.
static inline bool isUppercase(char c) {
    return c >= 'A' && c <= 'Z';
}
99
// Returns true if |c| is in the range '0'..'9'.
static inline bool isDigit(char c) {
    return c >= '0' && c <= '9';
}
103
104// Returns true if the buffer is valid for language code.
105static inline bool isValidLanguageCode(const char* buffer, size_t length) {
106    if (length != 2 && length != 3) return false;
107    if (!isLowercase(buffer[0])) return false;
108    if (!isLowercase(buffer[1])) return false;
109    if (length == 3 && !isLowercase(buffer[2])) return false;
110    return true;
111}
112
113// Returns true if buffer is valid for script code. The length of buffer must be 4.
114static inline bool isValidScriptCode(const char* buffer) {
115    return isUppercase(buffer[0]) && isLowercase(buffer[1]) && isLowercase(buffer[2]) &&
116        isLowercase(buffer[3]);
117}
118
119// Returns true if the buffer is valid for region code.
120static inline bool isValidRegionCode(const char* buffer, size_t length) {
121    return (length == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) ||
122            (length == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2]));
123}
124
125// Parse BCP 47 language identifier into internal structure
126FontLanguage::FontLanguage(const char* buf, size_t length) : FontLanguage() {
127    size_t firstDelimiterPos = nextDelimiterIndex(buf, length, 0);
128    if (isValidLanguageCode(buf, firstDelimiterPos)) {
129        mLanguage = packLanguageOrRegion(buf, firstDelimiterPos, 'a', 'a');
130    } else {
131        // We don't understand anything other than two-letter or three-letter
132        // language codes, so we skip parsing the rest of the string.
133        return;
134    }
135
136    if (firstDelimiterPos == length) {
137        mHbLanguage = hb_language_from_string(getString().c_str(), -1);
138        return;  // Language code only.
139    }
140
141    size_t nextComponentStartPos = firstDelimiterPos + 1;
142    size_t nextDelimiterPos = nextDelimiterIndex(buf, length, nextComponentStartPos);
143    size_t componentLength = nextDelimiterPos - nextComponentStartPos;
144
145    if (componentLength == 4) {
146        // Possibly script code.
147        const char* p = buf + nextComponentStartPos;
148        if (isValidScriptCode(p)) {
149            mScript = SCRIPT_TAG(p[0], p[1], p[2], p[3]);
150            mSubScriptBits = scriptToSubScriptBits(mScript);
151        }
152
153        if (nextDelimiterPos == length) {
154            mHbLanguage = hb_language_from_string(getString().c_str(), -1);
155            mEmojiStyle = resolveEmojiStyle(buf, length, mScript);
156            return;  // No region code.
157        }
158
159        nextComponentStartPos = nextDelimiterPos + 1;
160        nextDelimiterPos = nextDelimiterIndex(buf, length, nextComponentStartPos);
161        componentLength = nextDelimiterPos - nextComponentStartPos;
162    }
163
164    if (componentLength == 2 || componentLength == 3) {
165        // Possibly region code.
166        const char* p = buf + nextComponentStartPos;
167        if (isValidRegionCode(p, componentLength)) {
168            mRegion = packLanguageOrRegion(p, componentLength, 'A', '0');
169        }
170    }
171
172    mHbLanguage = hb_language_from_string(getString().c_str(), -1);
173    mEmojiStyle = resolveEmojiStyle(buf, length, mScript);
174}
175
176// static
177FontLanguage::EmojiStyle FontLanguage::resolveEmojiStyle(const char* buf, size_t length,
178        uint32_t script) {
179    // First, lookup emoji subtag.
180    // 10 is the length of "-u-em-text", which is the shortest emoji subtag,
181    // unnecessary comparison can be avoided if total length is smaller than 10.
182    const size_t kMinSubtagLength = 10;
183    if (length >= kMinSubtagLength) {
184        static const char kPrefix[] = "-u-em-";
185        const char *pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
186        if (pos != buf + length) {  // found
187            pos += strlen(kPrefix);
188            const size_t remainingLength = length - (pos - buf);
189            if (isEmojiSubtag(pos, remainingLength, "emoji", 5)){
190                return EMSTYLE_EMOJI;
191            } else if (isEmojiSubtag(pos, remainingLength, "text", 4)){
192                return EMSTYLE_TEXT;
193            } else if (isEmojiSubtag(pos, remainingLength, "default", 7)){
194                return EMSTYLE_DEFAULT;
195            }
196        }
197    }
198
199    // If no emoji subtag was provided, resolve the emoji style from script code.
200    if (script == SCRIPT_TAG('Z', 's', 'y', 'e')) {
201        return EMSTYLE_EMOJI;
202    } else if (script == SCRIPT_TAG('Z', 's', 'y', 'm')) {
203        return EMSTYLE_TEXT;
204    }
205
206    return EMSTYLE_EMPTY;
207}
208
209//static
210uint8_t FontLanguage::scriptToSubScriptBits(uint32_t script) {
211    uint8_t subScriptBits = 0u;
212    switch (script) {
213        case SCRIPT_TAG('B', 'o', 'p', 'o'):
214            subScriptBits = kBopomofoFlag;
215            break;
216        case SCRIPT_TAG('H', 'a', 'n', 'g'):
217            subScriptBits = kHangulFlag;
218            break;
219        case SCRIPT_TAG('H', 'a', 'n', 'b'):
220            // Bopomofo is almost exclusively used in Taiwan.
221            subScriptBits = kHanFlag | kBopomofoFlag;
222            break;
223        case SCRIPT_TAG('H', 'a', 'n', 'i'):
224            subScriptBits = kHanFlag;
225            break;
226        case SCRIPT_TAG('H', 'a', 'n', 's'):
227            subScriptBits = kHanFlag | kSimplifiedChineseFlag;
228            break;
229        case SCRIPT_TAG('H', 'a', 'n', 't'):
230            subScriptBits = kHanFlag | kTraditionalChineseFlag;
231            break;
232        case SCRIPT_TAG('H', 'i', 'r', 'a'):
233            subScriptBits = kHiraganaFlag;
234            break;
235        case SCRIPT_TAG('H', 'r', 'k', 't'):
236            subScriptBits = kKatakanaFlag | kHiraganaFlag;
237            break;
238        case SCRIPT_TAG('J', 'p', 'a', 'n'):
239            subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag;
240            break;
241        case SCRIPT_TAG('K', 'a', 'n', 'a'):
242            subScriptBits = kKatakanaFlag;
243            break;
244        case SCRIPT_TAG('K', 'o', 'r', 'e'):
245            subScriptBits = kHanFlag | kHangulFlag;
246            break;
247    }
248    return subScriptBits;
249}
250
251std::string FontLanguage::getString() const {
252    if (isUnsupported()) {
253        return "und";
254    }
255    char buf[16];
256    size_t i = unpackLanguageOrRegion(mLanguage, buf, 'a', 'a');
257    if (mScript != 0) {
258        buf[i++] = '-';
259        buf[i++] = (mScript >> 24) & 0xFFu;
260        buf[i++] = (mScript >> 16) & 0xFFu;
261        buf[i++] = (mScript >> 8) & 0xFFu;
262        buf[i++] = mScript & 0xFFu;
263    }
264    if (mRegion != INVALID_CODE) {
265        buf[i++] = '-';
266        i += unpackLanguageOrRegion(mRegion, buf + i, 'A', '0');
267    }
268    return std::string(buf, i);
269}
270
271bool FontLanguage::isEqualScript(const FontLanguage& other) const {
272    return other.mScript == mScript;
273}
274
275// static
276bool FontLanguage::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
277    return requestedBits != 0 && (providedBits & requestedBits) == requestedBits;
278}
279
280bool FontLanguage::supportsHbScript(hb_script_t script) const {
281    static_assert(SCRIPT_TAG('J', 'p', 'a', 'n') == HB_TAG('J', 'p', 'a', 'n'),
282                  "The Minikin script and HarfBuzz hb_script_t have different encodings.");
283    if (script == mScript) return true;
284    return supportsScript(mSubScriptBits, scriptToSubScriptBits(script));
285}
286
287int FontLanguage::calcScoreFor(const FontLanguages& supported) const {
288    bool languageScriptMatch = false;
289    bool subtagMatch = false;
290    bool scriptMatch = false;
291
292    for (size_t i = 0; i < supported.size(); ++i) {
293        if (mEmojiStyle != EMSTYLE_EMPTY &&
294               mEmojiStyle == supported[i].mEmojiStyle) {
295            subtagMatch = true;
296            if (mLanguage == supported[i].mLanguage) {
297                return 4;
298            }
299        }
300        if (isEqualScript(supported[i]) ||
301                supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
302            scriptMatch = true;
303            if (mLanguage == supported[i].mLanguage) {
304                languageScriptMatch = true;
305            }
306        }
307    }
308
309    if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
310        scriptMatch = true;
311        if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLanguage()) {
312            return 3;
313        }
314    }
315
316    if (languageScriptMatch) {
317        return 3;
318    } else if (subtagMatch) {
319        return 2;
320    } else if (scriptMatch) {
321        return 1;
322    }
323    return 0;
324}
325
326FontLanguages::FontLanguages(std::vector<FontLanguage>&& languages)
327    : mLanguages(std::move(languages)) {
328    if (mLanguages.empty()) {
329        return;
330    }
331
332    const FontLanguage& lang = mLanguages[0];
333
334    mIsAllTheSameLanguage = true;
335    mUnionOfSubScriptBits = lang.mSubScriptBits;
336    for (size_t i = 1; i < mLanguages.size(); ++i) {
337        mUnionOfSubScriptBits |= mLanguages[i].mSubScriptBits;
338        if (mIsAllTheSameLanguage && lang.mLanguage != mLanguages[i].mLanguage) {
339            mIsAllTheSameLanguage = false;
340        }
341    }
342}
343
344#undef SCRIPT_TAG
345}  // namespace minikin
346