/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien
17d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien#include <gtest/gtest.h>
18d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien#include <unicode/utf.h>
19d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien#include <cstdlib>
20d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien
21d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien// src is of the form "U+1F431 | 'h' 'i'". Position of "|" gets saved to offset if non-null.
22d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien// Size is returned in an out parameter because gtest needs a void return for ASSERT to work.
23d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levienvoid ParseUnicode(uint16_t* buf, size_t buf_size, const char* src, size_t* result_size,
24d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        size_t* offset) {
25d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    size_t input_ix = 0;
26d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    size_t output_ix = 0;
27d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    bool seen_offset = false;
28d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien
29d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    while (src[input_ix] != 0) {
30d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        switch (src[input_ix]) {
31d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        case '\'':
32d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            // single ASCII char
33d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            ASSERT_LT(src[input_ix], 0x80);
34d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix++;
35d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            ASSERT_NE(src[input_ix], 0);
36d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            ASSERT_LT(output_ix, buf_size);
37d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            buf[output_ix++] = (uint16_t)src[input_ix++];
38d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            ASSERT_EQ(src[input_ix], '\'');
39d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix++;
40d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            break;
41d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        case 'u':
42d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        case 'U': {
43d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            // Unicode codepoint in hex syntax
44d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix++;
45d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            ASSERT_EQ(src[input_ix], '+');
46d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix++;
47d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            char* endptr = (char*)src + input_ix;
48d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            unsigned long int codepoint = strtoul(src + input_ix, &endptr, 16);
49d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            size_t num_hex_digits = endptr - (src + input_ix);
50d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            ASSERT_GE(num_hex_digits, 4u);  // also triggers on invalid number syntax, digits = 0
51d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            ASSERT_LE(num_hex_digits, 6u);
52d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            ASSERT_LE(codepoint, 0x10FFFFu);
53d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix += num_hex_digits;
54d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            if (U16_LENGTH(codepoint) == 1) {
55d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien                ASSERT_LE(output_ix + 1, buf_size);
56d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien                buf[output_ix++] = codepoint;
57d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            } else {
58d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien                // UTF-16 encoding
59d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien                ASSERT_LE(output_ix + 2, buf_size);
60d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien                buf[output_ix++] = U16_LEAD(codepoint);
61d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien                buf[output_ix++] = U16_TRAIL(codepoint);
62d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            }
63d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            break;
64d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        }
65d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        case ' ':
66d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix++;
67d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            break;
68d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        case '|':
69d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            ASSERT_FALSE(seen_offset);
70d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            ASSERT_NE(offset, nullptr);
71d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            *offset = output_ix;
72d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            seen_offset = true;
73d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            input_ix++;
74d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            break;
75d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        default:
76d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien            FAIL();  // unexpected character
77d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien        }
78d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    }
79d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    ASSERT_NE(result_size, nullptr);
80d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    *result_size = output_ix;
81d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    ASSERT_TRUE(seen_offset || offset == nullptr);
82d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien}
83d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien
84d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph LevienTEST(UnicodeUtils, parse) {
85d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    const size_t BUF_SIZE = 256;
86d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    uint16_t buf[BUF_SIZE];
87d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    size_t offset;
88d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    size_t size;
89d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    ParseUnicode(buf, BUF_SIZE, "U+000D U+1F431 | 'a'", &size, &offset);
90d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    EXPECT_EQ(size, 4u);
91d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    EXPECT_EQ(offset, 3u);
92d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    EXPECT_EQ(buf[0], 0x000D);
93d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    EXPECT_EQ(buf[1], 0xD83D);
94d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    EXPECT_EQ(buf[2], 0xDC31);
95d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien    EXPECT_EQ(buf[3], 'a');
96d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien}
97