/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <gtest/gtest.h>
#include <unicode/utf.h>
#include <cstdlib>

// src is of the form "U+1F431 | 'h' 'i'". Position of "|" gets saved to offset if non-null.
22d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien// Size is returned in an out parameter because gtest needs a void return for ASSERT to work. 23d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levienvoid ParseUnicode(uint16_t* buf, size_t buf_size, const char* src, size_t* result_size, 24d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien size_t* offset) { 25d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien size_t input_ix = 0; 26d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien size_t output_ix = 0; 27d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien bool seen_offset = false; 28d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 29d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien while (src[input_ix] != 0) { 30d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien switch (src[input_ix]) { 31d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien case '\'': 32d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // single ASCII char 33d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_LT(src[input_ix], 0x80); 34d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien input_ix++; 35d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_NE(src[input_ix], 0); 36d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_LT(output_ix, buf_size); 37d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien buf[output_ix++] = (uint16_t)src[input_ix++]; 38d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_EQ(src[input_ix], '\''); 39d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien input_ix++; 40d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien break; 41d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien case 'u': 42d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien case 'U': { 43d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // Unicode codepoint in hex syntax 44d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien input_ix++; 45d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_EQ(src[input_ix], '+'); 46d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph 
Levien input_ix++; 47d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien char* endptr = (char*)src + input_ix; 48d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien unsigned long int codepoint = strtoul(src + input_ix, &endptr, 16); 49d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien size_t num_hex_digits = endptr - (src + input_ix); 50d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_GE(num_hex_digits, 4u); // also triggers on invalid number syntax, digits = 0 51d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_LE(num_hex_digits, 6u); 52d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_LE(codepoint, 0x10FFFFu); 53d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien input_ix += num_hex_digits; 54d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien if (U16_LENGTH(codepoint) == 1) { 55d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_LE(output_ix + 1, buf_size); 56d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien buf[output_ix++] = codepoint; 57d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien } else { 58d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien // UTF-16 encoding 59d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_LE(output_ix + 2, buf_size); 60d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien buf[output_ix++] = U16_LEAD(codepoint); 61d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien buf[output_ix++] = U16_TRAIL(codepoint); 62d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien } 63d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien break; 64d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien } 65d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien case ' ': 66d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien input_ix++; 67d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien break; 68d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien case '|': 69d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_FALSE(seen_offset); 70d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_NE(offset, nullptr); 
71d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien *offset = output_ix; 72d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien seen_offset = true; 73d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien input_ix++; 74d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien break; 75d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien default: 76d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien FAIL(); // unexpected character 77d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien } 78d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien } 79d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_NE(result_size, nullptr); 80d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien *result_size = output_ix; 81d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ASSERT_TRUE(seen_offset || offset == nullptr); 82d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien} 83d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien 84d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph LevienTEST(UnicodeUtils, parse) { 85d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien const size_t BUF_SIZE = 256; 86d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien uint16_t buf[BUF_SIZE]; 87d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien size_t offset; 88d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien size_t size; 89d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien ParseUnicode(buf, BUF_SIZE, "U+000D U+1F431 | 'a'", &size, &offset); 90d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_EQ(size, 4u); 91d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_EQ(offset, 3u); 92d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_EQ(buf[0], 0x000D); 93d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_EQ(buf[1], 0xD83D); 94d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_EQ(buf[2], 0xDC31); 95d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien EXPECT_EQ(buf[3], 'a'); 96d8dd94b81ea7efd776859fbbdf4a76458e270eabRaph Levien} 97