1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include "utf.h" 18 19#include "common_runtime_test.h" 20#include "utf-inl.h" 21 22#include <vector> 23 24namespace art { 25 26class UtfTest : public CommonRuntimeTest {}; 27 28TEST_F(UtfTest, GetLeadingUtf16Char) { 29 EXPECT_EQ(0xffff, GetLeadingUtf16Char(0xeeeeffff)); 30} 31 32TEST_F(UtfTest, GetTrailingUtf16Char) { 33 EXPECT_EQ(0xffff, GetTrailingUtf16Char(0xffffeeee)); 34 EXPECT_EQ(0, GetTrailingUtf16Char(0x0000aaaa)); 35} 36 37#define EXPECT_ARRAY_POSITION(expected, end, start) \ 38 EXPECT_EQ(static_cast<uintptr_t>(expected), \ 39 reinterpret_cast<uintptr_t>(end) - reinterpret_cast<uintptr_t>(start)); 40 41// A test string containing one, two, three and four byte UTF-8 sequences. 42static const uint8_t kAllSequences[] = { 43 0x24, 44 0xc2, 0xa2, 45 0xe2, 0x82, 0xac, 46 0xf0, 0x9f, 0x8f, 0xa0, 47 0x00 48}; 49 50// A test string that contains a UTF-8 encoding of a surrogate pair 51// (code point = U+10400) 52static const uint8_t kSurrogateEncoding[] = { 53 0xed, 0xa0, 0x81, 54 0xed, 0xb0, 0x80, 55 0x00 56}; 57 58TEST_F(UtfTest, GetUtf16FromUtf8) { 59 const char* const start = reinterpret_cast<const char*>(kAllSequences); 60 const char* ptr = start; 61 uint32_t pair = 0; 62 63 // Single byte sequence. 64 pair = GetUtf16FromUtf8(&ptr); 65 EXPECT_EQ(0x24, GetLeadingUtf16Char(pair)); 66 EXPECT_EQ(0, GetTrailingUtf16Char(pair)); 67 EXPECT_ARRAY_POSITION(1, ptr, start); 68 69 // Two byte sequence 70 pair = GetUtf16FromUtf8(&ptr); 71 EXPECT_EQ(0xa2, GetLeadingUtf16Char(pair)); 72 EXPECT_EQ(0, GetTrailingUtf16Char(pair)); 73 EXPECT_ARRAY_POSITION(3, ptr, start); 74 75 // Three byte sequence 76 pair = GetUtf16FromUtf8(&ptr); 77 EXPECT_EQ(0x20ac, GetLeadingUtf16Char(pair)); 78 EXPECT_EQ(0, GetTrailingUtf16Char(pair)); 79 EXPECT_ARRAY_POSITION(6, ptr, start); 80 81 // Four byte sequence 82 pair = GetUtf16FromUtf8(&ptr); 83 EXPECT_EQ(0xd83c, GetLeadingUtf16Char(pair)); 84 EXPECT_EQ(0xdfe0, GetTrailingUtf16Char(pair)); 85 EXPECT_ARRAY_POSITION(10, ptr, start); 86 87 // Null terminator 88 pair = GetUtf16FromUtf8(&ptr); 89 EXPECT_EQ(0, GetLeadingUtf16Char(pair)); 90 EXPECT_EQ(0, GetTrailingUtf16Char(pair)); 91 EXPECT_ARRAY_POSITION(11, ptr, start); 92} 93 94TEST_F(UtfTest, GetUtf16FromUtf8_SurrogatesPassThrough) { 95 const char* const start = reinterpret_cast<const char *>(kSurrogateEncoding); 96 const char* ptr = start; 97 uint32_t pair = 0; 98 99 pair = GetUtf16FromUtf8(&ptr); 100 EXPECT_EQ(0xd801, GetLeadingUtf16Char(pair)); 101 EXPECT_EQ(0, GetTrailingUtf16Char(pair)); 102 EXPECT_ARRAY_POSITION(3, ptr, start); 103 104 pair = GetUtf16FromUtf8(&ptr); 105 EXPECT_EQ(0xdc00, GetLeadingUtf16Char(pair)); 106 EXPECT_EQ(0, GetTrailingUtf16Char(pair)); 107 EXPECT_ARRAY_POSITION(6, ptr, start); 108} 109 110TEST_F(UtfTest, CountModifiedUtf8Chars) { 111 EXPECT_EQ(5u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kAllSequences))); 112 EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding))); 113} 114 115static void AssertConversion(const std::vector<uint16_t> input, 116 const std::vector<uint8_t> expected) { 117 ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size())); 118 119 std::vector<uint8_t> output(expected.size()); 120 ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), &input[0], input.size()); 121 EXPECT_EQ(expected, output); 122} 123 124TEST_F(UtfTest, CountAndConvertUtf8Bytes) { 125 // Surrogate pairs will be converted into 4 byte sequences. 126 AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 }); 127 128 // Three byte encodings that are below & above the leading surrogate 129 // range respectively. 130 AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 }); 131 AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf }); 132 // Two byte encoding. 133 AssertConversion({ 0x0101 }, { 0xc4, 0x81 }); 134 135 // Two byte special case : 0 must use an overlong encoding. 136 AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 }); 137 138 // One byte encoding. 139 AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f }); 140 141 AssertConversion({ 142 0xd802, 0xdc02, // Surrogate pair 143 0xdef0, 0xdcff, // Three byte encodings 144 0x0101, 0x0000, // Two byte encodings 145 'p' , 'p' // One byte encoding 146 }, { 147 0xf0, 0x90, 0xa0, 0x82, 148 0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf, 149 0xc4, 0x81, 0xc0, 0x80, 150 0x70, 0x70 151 }); 152} 153 154TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) { 155 // Unpaired trailing surrogate at the end of input. 156 AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 }); 157 // Unpaired (or incorrectly paired) surrogates in the middle of the input. 158 AssertConversion({ 'h', 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 'e' }); 159 AssertConversion({ 'h', 0xd801, 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81, 'e' }); 160 AssertConversion({ 'h', 0xdc00, 0xdc00, 'e' }, { 'h', 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80, 'e' }); 161} 162 163} // namespace art 164