1//===- llvm/unittest/Support/ConvertUTFTest.cpp - ConvertUTF tests --------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "llvm/Support/ConvertUTF.h"
11#include "gtest/gtest.h"
12#include <string>
13#include <vector>
14#include <utility>
15
16using namespace llvm;
17
18TEST(ConvertUTFTest, ConvertUTF16LittleEndianToUTF8String) {
19  // Src is the look of disapproval.
20  static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
21  ArrayRef<char> Ref(Src, sizeof(Src) - 1);
22  std::string Result;
23  bool Success = convertUTF16ToUTF8String(Ref, Result);
24  EXPECT_TRUE(Success);
25  std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
26  EXPECT_EQ(Expected, Result);
27}
28
29TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
30  // Src is the look of disapproval.
31  static const char Src[] = "\xfe\xff\x0c\xa0\x00_\x0c\xa0";
32  ArrayRef<char> Ref(Src, sizeof(Src) - 1);
33  std::string Result;
34  bool Success = convertUTF16ToUTF8String(Ref, Result);
35  EXPECT_TRUE(Success);
36  std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
37  EXPECT_EQ(Expected, Result);
38}
39
40TEST(ConvertUTFTest, OddLengthInput) {
41  std::string Result;
42  bool Success = convertUTF16ToUTF8String(ArrayRef<char>("xxxxx", 5), Result);
43  EXPECT_FALSE(Success);
44}
45
46TEST(ConvertUTFTest, Empty) {
47  std::string Result;
48  bool Success = convertUTF16ToUTF8String(ArrayRef<char>(), Result);
49  EXPECT_TRUE(Success);
50  EXPECT_TRUE(Result.empty());
51}
52
53TEST(ConvertUTFTest, HasUTF16BOM) {
54  bool HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xff\xfe", 2));
55  EXPECT_TRUE(HasBOM);
56  HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff", 2));
57  EXPECT_TRUE(HasBOM);
58  HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff ", 3));
59  EXPECT_TRUE(HasBOM); // Don't care about odd lengths.
60  HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe\xff\x00asdf", 6));
61  EXPECT_TRUE(HasBOM);
62
63  HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>());
64  EXPECT_FALSE(HasBOM);
65  HasBOM = hasUTF16ByteOrderMark(ArrayRef<char>("\xfe", 1));
66  EXPECT_FALSE(HasBOM);
67}
68
69struct ConvertUTFResultContainer {
70  ConversionResult ErrorCode;
71  std::vector<unsigned> UnicodeScalars;
72
73  ConvertUTFResultContainer(ConversionResult ErrorCode)
74      : ErrorCode(ErrorCode) {}
75
76  ConvertUTFResultContainer
77  withScalars(unsigned US0 = 0x110000, unsigned US1 = 0x110000,
78              unsigned US2 = 0x110000, unsigned US3 = 0x110000,
79              unsigned US4 = 0x110000, unsigned US5 = 0x110000,
80              unsigned US6 = 0x110000, unsigned US7 = 0x110000) {
81    ConvertUTFResultContainer Result(*this);
82    if (US0 != 0x110000)
83      Result.UnicodeScalars.push_back(US0);
84    if (US1 != 0x110000)
85      Result.UnicodeScalars.push_back(US1);
86    if (US2 != 0x110000)
87      Result.UnicodeScalars.push_back(US2);
88    if (US3 != 0x110000)
89      Result.UnicodeScalars.push_back(US3);
90    if (US4 != 0x110000)
91      Result.UnicodeScalars.push_back(US4);
92    if (US5 != 0x110000)
93      Result.UnicodeScalars.push_back(US5);
94    if (US6 != 0x110000)
95      Result.UnicodeScalars.push_back(US6);
96    if (US7 != 0x110000)
97      Result.UnicodeScalars.push_back(US7);
98    return Result;
99  }
100};
101
102std::pair<ConversionResult, std::vector<unsigned>>
103ConvertUTF8ToUnicodeScalarsLenient(StringRef S) {
104  const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
105
106  const UTF8 *SourceNext = SourceStart;
107  std::vector<UTF32> Decoded(S.size(), 0);
108  UTF32 *TargetStart = Decoded.data();
109
110  auto ErrorCode =
111      ConvertUTF8toUTF32(&SourceNext, SourceStart + S.size(), &TargetStart,
112                         Decoded.data() + Decoded.size(), lenientConversion);
113
114  Decoded.resize(TargetStart - Decoded.data());
115
116  return std::make_pair(ErrorCode, Decoded);
117}
118
119std::pair<ConversionResult, std::vector<unsigned>>
120ConvertUTF8ToUnicodeScalarsPartialLenient(StringRef S) {
121  const UTF8 *SourceStart = reinterpret_cast<const UTF8 *>(S.data());
122
123  const UTF8 *SourceNext = SourceStart;
124  std::vector<UTF32> Decoded(S.size(), 0);
125  UTF32 *TargetStart = Decoded.data();
126
127  auto ErrorCode = ConvertUTF8toUTF32Partial(
128      &SourceNext, SourceStart + S.size(), &TargetStart,
129      Decoded.data() + Decoded.size(), lenientConversion);
130
131  Decoded.resize(TargetStart - Decoded.data());
132
133  return std::make_pair(ErrorCode, Decoded);
134}
135
136::testing::AssertionResult
137CheckConvertUTF8ToUnicodeScalars(ConvertUTFResultContainer Expected,
138                                 StringRef S, bool Partial = false) {
139  ConversionResult ErrorCode;
140  std::vector<unsigned> Decoded;
141  if (!Partial)
142    std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsLenient(S);
143  else
144
145    std::tie(ErrorCode, Decoded) = ConvertUTF8ToUnicodeScalarsPartialLenient(S);
146  if (Expected.ErrorCode != ErrorCode)
147    return ::testing::AssertionFailure() << "Expected error code "
148                                         << Expected.ErrorCode << ", actual "
149                                         << ErrorCode;
150
151  if (Expected.UnicodeScalars != Decoded)
152    return ::testing::AssertionFailure()
153           << "Expected lenient decoded result:\n"
154           << ::testing::PrintToString(Expected.UnicodeScalars) << "\n"
155           << "Actual result:\n" << ::testing::PrintToString(Decoded);
156
157  return ::testing::AssertionSuccess();
158}
159
160TEST(ConvertUTFTest, UTF8ToUTF32Lenient) {
161
162  //
163  // 1-byte sequences
164  //
165
166  // U+0041 LATIN CAPITAL LETTER A
167  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
168      ConvertUTFResultContainer(conversionOK).withScalars(0x0041), "\x41"));
169
170  //
171  // 2-byte sequences
172  //
173
174  // U+0283 LATIN SMALL LETTER ESH
175  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
176      ConvertUTFResultContainer(conversionOK).withScalars(0x0283),
177      "\xca\x83"));
178
179  // U+03BA GREEK SMALL LETTER KAPPA
180  // U+1F79 GREEK SMALL LETTER OMICRON WITH OXIA
181  // U+03C3 GREEK SMALL LETTER SIGMA
182  // U+03BC GREEK SMALL LETTER MU
183  // U+03B5 GREEK SMALL LETTER EPSILON
184  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
185      ConvertUTFResultContainer(conversionOK)
186          .withScalars(0x03ba, 0x1f79, 0x03c3, 0x03bc, 0x03b5),
187      "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5"));
188
189  //
190  // 3-byte sequences
191  //
192
193  // U+4F8B CJK UNIFIED IDEOGRAPH-4F8B
194  // U+6587 CJK UNIFIED IDEOGRAPH-6587
195  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
196      ConvertUTFResultContainer(conversionOK).withScalars(0x4f8b, 0x6587),
197      "\xe4\xbe\x8b\xe6\x96\x87"));
198
199  // U+D55C HANGUL SYLLABLE HAN
200  // U+AE00 HANGUL SYLLABLE GEUL
201  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
202      ConvertUTFResultContainer(conversionOK).withScalars(0xd55c, 0xae00),
203      "\xed\x95\x9c\xea\xb8\x80"));
204
205  // U+1112 HANGUL CHOSEONG HIEUH
206  // U+1161 HANGUL JUNGSEONG A
207  // U+11AB HANGUL JONGSEONG NIEUN
208  // U+1100 HANGUL CHOSEONG KIYEOK
209  // U+1173 HANGUL JUNGSEONG EU
210  // U+11AF HANGUL JONGSEONG RIEUL
211  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
212      ConvertUTFResultContainer(conversionOK)
213          .withScalars(0x1112, 0x1161, 0x11ab, 0x1100, 0x1173, 0x11af),
214      "\xe1\x84\x92\xe1\x85\xa1\xe1\x86\xab\xe1\x84\x80\xe1\x85\xb3"
215      "\xe1\x86\xaf"));
216
217  //
218  // 4-byte sequences
219  //
220
221  // U+E0100 VARIATION SELECTOR-17
222  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
223      ConvertUTFResultContainer(conversionOK).withScalars(0x000E0100),
224      "\xf3\xa0\x84\x80"));
225
226  //
227  // First possible sequence of a certain length
228  //
229
230  // U+0000 NULL
231  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
232      ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
233      StringRef("\x00", 1)));
234
235  // U+0080 PADDING CHARACTER
236  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
237      ConvertUTFResultContainer(conversionOK).withScalars(0x0080),
238      "\xc2\x80"));
239
240  // U+0800 SAMARITAN LETTER ALAF
241  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
242      ConvertUTFResultContainer(conversionOK).withScalars(0x0800),
243      "\xe0\xa0\x80"));
244
245  // U+10000 LINEAR B SYLLABLE B008 A
246  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
247      ConvertUTFResultContainer(conversionOK).withScalars(0x10000),
248      "\xf0\x90\x80\x80"));
249
250  // U+200000 (invalid)
251  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
252      ConvertUTFResultContainer(sourceIllegal)
253          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
254      "\xf8\x88\x80\x80\x80"));
255
256  // U+4000000 (invalid)
257  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
258      ConvertUTFResultContainer(sourceIllegal)
259          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
260      "\xfc\x84\x80\x80\x80\x80"));
261
262  //
263  // Last possible sequence of a certain length
264  //
265
266  // U+007F DELETE
267  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
268      ConvertUTFResultContainer(conversionOK).withScalars(0x007f), "\x7f"));
269
270  // U+07FF (unassigned)
271  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
272      ConvertUTFResultContainer(conversionOK).withScalars(0x07ff),
273      "\xdf\xbf"));
274
275  // U+FFFF (noncharacter)
276  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
277      ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
278      "\xef\xbf\xbf"));
279
280  // U+1FFFFF (invalid)
281  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
282      ConvertUTFResultContainer(sourceIllegal)
283          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
284      "\xf7\xbf\xbf\xbf"));
285
286  // U+3FFFFFF (invalid)
287  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
288      ConvertUTFResultContainer(sourceIllegal)
289          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
290      "\xfb\xbf\xbf\xbf\xbf"));
291
292  // U+7FFFFFFF (invalid)
293  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
294      ConvertUTFResultContainer(sourceIllegal)
295          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
296      "\xfd\xbf\xbf\xbf\xbf\xbf"));
297
298  //
299  // Other boundary conditions
300  //
301
302  // U+D7FF (unassigned)
303  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
304      ConvertUTFResultContainer(conversionOK).withScalars(0xd7ff),
305      "\xed\x9f\xbf"));
306
307  // U+E000 (private use)
308  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
309      ConvertUTFResultContainer(conversionOK).withScalars(0xe000),
310      "\xee\x80\x80"));
311
312  // U+FFFD REPLACEMENT CHARACTER
313  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
314      ConvertUTFResultContainer(conversionOK).withScalars(0xfffd),
315      "\xef\xbf\xbd"));
316
317  // U+10FFFF (noncharacter)
318  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
319      ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
320      "\xf4\x8f\xbf\xbf"));
321
322  // U+110000 (invalid)
323  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
324      ConvertUTFResultContainer(sourceIllegal)
325          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
326      "\xf4\x90\x80\x80"));
327
328  //
329  // Unexpected continuation bytes
330  //
331
332  // A sequence of unexpected continuation bytes that don't follow a first
333  // byte, every byte is a maximal subpart.
334
335  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
336      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\x80"));
337  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
338      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xbf"));
339  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
340      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
341      "\x80\x80"));
342  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
343      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
344      "\x80\xbf"));
345  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
346      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
347      "\xbf\x80"));
348  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
349      ConvertUTFResultContainer(sourceIllegal)
350          .withScalars(0xfffd, 0xfffd, 0xfffd),
351      "\x80\xbf\x80"));
352  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
353      ConvertUTFResultContainer(sourceIllegal)
354          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
355      "\x80\xbf\x80\xbf"));
356  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
357      ConvertUTFResultContainer(sourceIllegal)
358          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
359      "\x80\xbf\x82\xbf\xaa"));
360  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
361      ConvertUTFResultContainer(sourceIllegal)
362          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
363      "\xaa\xb0\xbb\xbf\xaa\xa0"));
364  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
365      ConvertUTFResultContainer(sourceIllegal)
366          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
367      "\xaa\xb0\xbb\xbf\xaa\xa0\x8f"));
368
369  // All continuation bytes (0x80--0xbf).
370  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
371      ConvertUTFResultContainer(sourceIllegal)
372          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
373                       0xfffd, 0xfffd, 0xfffd, 0xfffd)
374          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
375                       0xfffd, 0xfffd, 0xfffd, 0xfffd)
376          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
377                       0xfffd, 0xfffd, 0xfffd, 0xfffd)
378          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
379                       0xfffd, 0xfffd, 0xfffd, 0xfffd)
380          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
381                       0xfffd, 0xfffd, 0xfffd, 0xfffd)
382          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
383                       0xfffd, 0xfffd, 0xfffd, 0xfffd)
384          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
385                       0xfffd, 0xfffd, 0xfffd, 0xfffd)
386          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
387                       0xfffd, 0xfffd, 0xfffd, 0xfffd),
388      "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
389      "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
390      "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf"
391      "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"));
392
393  //
394  // Lonely start bytes
395  //
396
397  // Start bytes of 2-byte sequences (0xc0--0xdf).
398  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
399      ConvertUTFResultContainer(sourceIllegal)
400          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
401                       0xfffd, 0xfffd, 0xfffd, 0xfffd)
402          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
403                       0xfffd, 0xfffd, 0xfffd, 0xfffd)
404          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
405                       0xfffd, 0xfffd, 0xfffd, 0xfffd)
406          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
407                       0xfffd, 0xfffd, 0xfffd, 0xfffd),
408      "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
409      "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"));
410
411  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
412      ConvertUTFResultContainer(sourceIllegal)
413          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
414                       0xfffd, 0x0020, 0xfffd, 0x0020)
415          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
416                       0xfffd, 0x0020, 0xfffd, 0x0020)
417          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
418                       0xfffd, 0x0020, 0xfffd, 0x0020)
419          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
420                       0xfffd, 0x0020, 0xfffd, 0x0020)
421          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
422                       0xfffd, 0x0020, 0xfffd, 0x0020)
423          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
424                       0xfffd, 0x0020, 0xfffd, 0x0020)
425          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
426                       0xfffd, 0x0020, 0xfffd, 0x0020)
427          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
428                       0xfffd, 0x0020, 0xfffd, 0x0020),
429      "\xc0\x20\xc1\x20\xc2\x20\xc3\x20\xc4\x20\xc5\x20\xc6\x20\xc7\x20"
430      "\xc8\x20\xc9\x20\xca\x20\xcb\x20\xcc\x20\xcd\x20\xce\x20\xcf\x20"
431      "\xd0\x20\xd1\x20\xd2\x20\xd3\x20\xd4\x20\xd5\x20\xd6\x20\xd7\x20"
432      "\xd8\x20\xd9\x20\xda\x20\xdb\x20\xdc\x20\xdd\x20\xde\x20\xdf\x20"));
433
434  // Start bytes of 3-byte sequences (0xe0--0xef).
435  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
436      ConvertUTFResultContainer(sourceIllegal)
437          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
438                       0xfffd, 0xfffd, 0xfffd, 0xfffd)
439          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
440                       0xfffd, 0xfffd, 0xfffd, 0xfffd),
441      "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"));
442
443  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
444      ConvertUTFResultContainer(sourceIllegal)
445          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
446                       0xfffd, 0x0020, 0xfffd, 0x0020)
447          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
448                       0xfffd, 0x0020, 0xfffd, 0x0020)
449          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
450                       0xfffd, 0x0020, 0xfffd, 0x0020)
451          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
452                       0xfffd, 0x0020, 0xfffd, 0x0020),
453      "\xe0\x20\xe1\x20\xe2\x20\xe3\x20\xe4\x20\xe5\x20\xe6\x20\xe7\x20"
454      "\xe8\x20\xe9\x20\xea\x20\xeb\x20\xec\x20\xed\x20\xee\x20\xef\x20"));
455
456  // Start bytes of 4-byte sequences (0xf0--0xf7).
457  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
458      ConvertUTFResultContainer(sourceIllegal)
459          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd,
460                       0xfffd, 0xfffd, 0xfffd, 0xfffd),
461      "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7"));
462
463  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
464      ConvertUTFResultContainer(sourceIllegal)
465          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
466                       0xfffd, 0x0020, 0xfffd, 0x0020)
467          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
468                       0xfffd, 0x0020, 0xfffd, 0x0020),
469      "\xf0\x20\xf1\x20\xf2\x20\xf3\x20\xf4\x20\xf5\x20\xf6\x20\xf7\x20"));
470
471  // Start bytes of 5-byte sequences (0xf8--0xfb).
472  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
473      ConvertUTFResultContainer(sourceIllegal)
474          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
475      "\xf8\xf9\xfa\xfb"));
476
477  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
478      ConvertUTFResultContainer(sourceIllegal)
479          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
480                       0xfffd, 0x0020, 0xfffd, 0x0020),
481      "\xf8\x20\xf9\x20\xfa\x20\xfb\x20"));
482
483  // Start bytes of 6-byte sequences (0xfc--0xfd).
484  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
485      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
486      "\xfc\xfd"));
487
488  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
489      ConvertUTFResultContainer(sourceIllegal)
490          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020),
491      "\xfc\x20\xfd\x20"));
492
493  //
494  // Other bytes (0xc0--0xc1, 0xfe--0xff).
495  //
496
497  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
498      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc0"));
499  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
500      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc1"));
501  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
502      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfe"));
503  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
504      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xff"));
505
506  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
507      ConvertUTFResultContainer(sourceIllegal)
508          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
509      "\xc0\xc1\xfe\xff"));
510
511  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
512      ConvertUTFResultContainer(sourceIllegal)
513          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
514      "\xfe\xfe\xff\xff"));
515
516  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
517      ConvertUTFResultContainer(sourceIllegal)
518          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
519      "\xfe\x80\x80\x80\x80\x80"));
520
521  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
522      ConvertUTFResultContainer(sourceIllegal)
523          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
524      "\xff\x80\x80\x80\x80\x80"));
525
526  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
527      ConvertUTFResultContainer(sourceIllegal)
528          .withScalars(0xfffd, 0x0020, 0xfffd, 0x0020,
529                       0xfffd, 0x0020, 0xfffd, 0x0020),
530      "\xc0\x20\xc1\x20\xfe\x20\xff\x20"));
531
532  //
533  // Sequences with one continuation byte missing
534  //
535
536  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
537      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xc2"));
538  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
539      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xdf"));
540  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
541      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
542      "\xe0\xa0"));
543  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
544      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
545      "\xe0\xbf"));
546  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
547      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
548      "\xe1\x80"));
549  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
550      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
551      "\xec\xbf"));
552  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
553      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
554      "\xed\x80"));
555  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
556      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
557      "\xed\x9f"));
558  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
559      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
560      "\xee\x80"));
561  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
562      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
563      "\xef\xbf"));
564  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
565      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
566      "\xf0\x90\x80"));
567  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
568      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
569      "\xf0\xbf\xbf"));
570  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
571      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
572      "\xf1\x80\x80"));
573  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
574      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
575      "\xf3\xbf\xbf"));
576  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
577      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
578      "\xf4\x80\x80"));
579  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
580      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
581      "\xf4\x8f\xbf"));
582
583  // Overlong sequences with one trailing byte missing.
584  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
585      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
586      "\xc0"));
587  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
588      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
589      "\xc1"));
590  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
591      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
592      "\xe0\x80"));
593  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
594      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
595      "\xe0\x9f"));
596  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
597      ConvertUTFResultContainer(sourceIllegal)
598          .withScalars(0xfffd, 0xfffd, 0xfffd),
599      "\xf0\x80\x80"));
600  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
601      ConvertUTFResultContainer(sourceIllegal)
602          .withScalars(0xfffd, 0xfffd, 0xfffd),
603      "\xf0\x8f\x80"));
604  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
605      ConvertUTFResultContainer(sourceIllegal)
606          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
607      "\xf8\x80\x80\x80"));
608  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
609      ConvertUTFResultContainer(sourceIllegal)
610          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
611      "\xfc\x80\x80\x80\x80"));
612
613  // Sequences that represent surrogates with one trailing byte missing.
614  // High surrogates
615  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
616      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
617      "\xed\xa0"));
618  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
619      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
620      "\xed\xac"));
621  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
622      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
623      "\xed\xaf"));
624  // Low surrogates
625  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
626      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
627      "\xed\xb0"));
628  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
629      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
630      "\xed\xb4"));
631  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
632      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
633      "\xed\xbf"));
634
635  // Ill-formed 4-byte sequences.
636  // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
637  // U+1100xx (invalid)
638  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
639      ConvertUTFResultContainer(sourceIllegal)
640          .withScalars(0xfffd, 0xfffd, 0xfffd),
641      "\xf4\x90\x80"));
642  // U+13FBxx (invalid)
643  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
644      ConvertUTFResultContainer(sourceIllegal)
645          .withScalars(0xfffd, 0xfffd, 0xfffd),
646      "\xf4\xbf\xbf"));
647  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
648      ConvertUTFResultContainer(sourceIllegal)
649          .withScalars(0xfffd, 0xfffd, 0xfffd),
650      "\xf5\x80\x80"));
651  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
652      ConvertUTFResultContainer(sourceIllegal)
653          .withScalars(0xfffd, 0xfffd, 0xfffd),
654      "\xf6\x80\x80"));
655  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
656      ConvertUTFResultContainer(sourceIllegal)
657          .withScalars(0xfffd, 0xfffd, 0xfffd),
658      "\xf7\x80\x80"));
659  // U+1FFBxx (invalid)
660  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
661      ConvertUTFResultContainer(sourceIllegal)
662          .withScalars(0xfffd, 0xfffd, 0xfffd),
663      "\xf7\xbf\xbf"));
664
665  // Ill-formed 5-byte sequences.
666  // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
667  // U+2000xx (invalid)
668  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
669      ConvertUTFResultContainer(sourceIllegal)
670          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
671      "\xf8\x88\x80\x80"));
672  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
673      ConvertUTFResultContainer(sourceIllegal)
674          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
675      "\xf8\xbf\xbf\xbf"));
676  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
677      ConvertUTFResultContainer(sourceIllegal)
678          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
679      "\xf9\x80\x80\x80"));
680  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
681      ConvertUTFResultContainer(sourceIllegal)
682          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
683      "\xfa\x80\x80\x80"));
684  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
685      ConvertUTFResultContainer(sourceIllegal)
686          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
687      "\xfb\x80\x80\x80"));
688  // U+3FFFFxx (invalid)
689  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
690      ConvertUTFResultContainer(sourceIllegal)
691          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
692      "\xfb\xbf\xbf\xbf"));
693
694  // Ill-formed 6-byte sequences.
695  // 1111110u 10uuuuuu 10uzzzzz 10zzzyyyy 10yyyyxx 10xxxxxx
696  // U+40000xx (invalid)
697  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
698      ConvertUTFResultContainer(sourceIllegal)
699          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
700      "\xfc\x84\x80\x80\x80"));
701  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
702      ConvertUTFResultContainer(sourceIllegal)
703          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
704      "\xfc\xbf\xbf\xbf\xbf"));
705  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
706      ConvertUTFResultContainer(sourceIllegal)
707          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
708      "\xfd\x80\x80\x80\x80"));
709  // U+7FFFFFxx (invalid)
710  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
711      ConvertUTFResultContainer(sourceIllegal)
712          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
713      "\xfd\xbf\xbf\xbf\xbf"));
714
715  //
716  // Sequences with two continuation bytes missing
717  //
718
719  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
720      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
721      "\xf0\x90"));
722  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
723      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
724      "\xf0\xbf"));
725  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
726      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
727      "\xf1\x80"));
728  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
729      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
730      "\xf3\xbf"));
731  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
732      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
733      "\xf4\x80"));
734  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
735      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd),
736      "\xf4\x8f"));
737
  // Overlong sequences with two trailing bytes missing.
739  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
740      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xe0"));
741  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
742      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
743      "\xf0\x80"));
744  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
745      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
746      "\xf0\x8f"));
747  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
748      ConvertUTFResultContainer(sourceIllegal)
749          .withScalars(0xfffd, 0xfffd, 0xfffd),
750      "\xf8\x80\x80"));
751  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
752      ConvertUTFResultContainer(sourceIllegal)
753          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
754      "\xfc\x80\x80\x80"));
755
756  // Sequences that represent surrogates with two trailing bytes missing.
757  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
758      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xed"));
759
760  // Ill-formed 4-byte sequences.
761  // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
762  // U+110yxx (invalid)
763  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
764      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
765      "\xf4\x90"));
766  // U+13Fyxx (invalid)
767  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
768      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
769      "\xf4\xbf"));
770  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
771      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
772      "\xf5\x80"));
773  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
774      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
775      "\xf6\x80"));
776  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
777      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
778      "\xf7\x80"));
779  // U+1FFyxx (invalid)
780  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
781      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
782      "\xf7\xbf"));
783
784  // Ill-formed 5-byte sequences.
785  // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
786  // U+200yxx (invalid)
787  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
788      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
789      "\xf8\x88\x80"));
790  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
791      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
792      "\xf8\xbf\xbf"));
793  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
794      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
795      "\xf9\x80\x80"));
796  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
797      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
798      "\xfa\x80\x80"));
799  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
800      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
801      "\xfb\x80\x80"));
802  // U+3FFFyxx (invalid)
803  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
804      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
805      "\xfb\xbf\xbf"));
806
807  // Ill-formed 6-byte sequences.
808  // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
809  // U+4000yxx (invalid)
810  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
811      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
812      "\xfc\x84\x80\x80"));
813  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
814      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
815      "\xfc\xbf\xbf\xbf"));
816  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
817      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
818      "\xfd\x80\x80\x80"));
819  // U+7FFFFyxx (invalid)
820  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
821      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
822      "\xfd\xbf\xbf\xbf"));
823
824  //
825  // Sequences with three continuation bytes missing
826  //
827
828  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
829      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
830  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
831      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf1"));
832  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
833      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf2"));
834  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
835      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf3"));
836  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
837      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf4"));
838
839  // Broken overlong sequences.
840  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
841      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf0"));
842  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
843      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
844      "\xf8\x80"));
845  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
846      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
847      "\xfc\x80\x80"));
848
849  // Ill-formed 4-byte sequences.
850  // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
851  // U+14yyxx (invalid)
852  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
853      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf5"));
854  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
855      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf6"));
856  // U+1Cyyxx (invalid)
857  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
858      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf7"));
859
860  // Ill-formed 5-byte sequences.
861  // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
862  // U+20yyxx (invalid)
863  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
864      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
865      "\xf8\x88"));
866  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
867      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
868      "\xf8\xbf"));
869  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
870      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
871      "\xf9\x80"));
872  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
873      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
874      "\xfa\x80"));
875  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
876      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
877      "\xfb\x80"));
878  // U+3FCyyxx (invalid)
879  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
880      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
881      "\xfb\xbf"));
882
883  // Ill-formed 6-byte sequences.
884  // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
885  // U+400yyxx (invalid)
886  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
887      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
888      "\xfc\x84\x80"));
889  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
890      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
891      "\xfc\xbf\xbf"));
892  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
893      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
894      "\xfd\x80\x80"));
895  // U+7FFCyyxx (invalid)
896  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
897      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd, 0xfffd),
898      "\xfd\xbf\xbf"));
899
900  //
901  // Sequences with four continuation bytes missing
902  //
903
904  // Ill-formed 5-byte sequences.
905  // 111110uu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
906  // U+uzyyxx (invalid)
907  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
908      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
909  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
910      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf9"));
911  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
912      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfa"));
913  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
914      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
915  // U+3zyyxx (invalid)
916  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
917      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfb"));
918
919  // Broken overlong sequences.
920  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
921      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xf8"));
922  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
923      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
924      "\xfc\x80"));
925
926  // Ill-formed 6-byte sequences.
927  // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
928  // U+uzzyyxx (invalid)
929  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
930      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
931      "\xfc\x84"));
932  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
933      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
934      "\xfc\xbf"));
935  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
936      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
937      "\xfd\x80"));
938  // U+7Fzzyyxx (invalid)
939  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
940      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
941      "\xfd\xbf"));
942
943  //
944  // Sequences with five continuation bytes missing
945  //
946
947  // Ill-formed 6-byte sequences.
948  // 1111110u 10uuuuuu 10zzzzzz 10zzyyyy 10yyyyxx 10xxxxxx
949  // U+uzzyyxx (invalid)
950  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
951      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfc"));
952  // U+uuzzyyxx (invalid)
953  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
954      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd), "\xfd"));
955
956  //
957  // Consecutive sequences with trailing bytes missing
958  //
959
960  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
961      ConvertUTFResultContainer(sourceIllegal)
962          .withScalars(0xfffd, /**/ 0xfffd, 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
963          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
964          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd)
965          .withScalars(0xfffd, /**/ 0xfffd, /**/ 0xfffd, 0xfffd, 0xfffd)
966          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd)
967          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
968      "\xc0" "\xe0\x80" "\xf0\x80\x80"
969      "\xf8\x80\x80\x80"
970      "\xfc\x80\x80\x80\x80"
971      "\xdf" "\xef\xbf" "\xf7\xbf\xbf"
972      "\xfb\xbf\xbf\xbf"
973      "\xfd\xbf\xbf\xbf\xbf"));
974
975  //
976  // Overlong UTF-8 sequences
977  //
978
979  // U+002F SOLIDUS
980  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
981      ConvertUTFResultContainer(conversionOK).withScalars(0x002f), "\x2f"));
982
983  // Overlong sequences of the above.
984  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
985      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
986      "\xc0\xaf"));
987  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
988      ConvertUTFResultContainer(sourceIllegal)
989          .withScalars(0xfffd, 0xfffd, 0xfffd),
990      "\xe0\x80\xaf"));
991  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
992      ConvertUTFResultContainer(sourceIllegal)
993          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
994      "\xf0\x80\x80\xaf"));
995  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
996      ConvertUTFResultContainer(sourceIllegal)
997          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
998      "\xf8\x80\x80\x80\xaf"));
999  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1000      ConvertUTFResultContainer(sourceIllegal)
1001          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1002      "\xfc\x80\x80\x80\x80\xaf"));
1003
1004  // U+0000 NULL
1005  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1006      ConvertUTFResultContainer(conversionOK).withScalars(0x0000),
1007      StringRef("\x00", 1)));
1008
1009  // Overlong sequences of the above.
1010  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1011      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1012      "\xc0\x80"));
1013  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1014      ConvertUTFResultContainer(sourceIllegal)
1015          .withScalars(0xfffd, 0xfffd, 0xfffd),
1016      "\xe0\x80\x80"));
1017  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1018      ConvertUTFResultContainer(sourceIllegal)
1019          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1020      "\xf0\x80\x80\x80"));
1021  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1022      ConvertUTFResultContainer(sourceIllegal)
1023          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1024      "\xf8\x80\x80\x80\x80"));
1025  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1026      ConvertUTFResultContainer(sourceIllegal)
1027          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1028      "\xfc\x80\x80\x80\x80\x80"));
1029
1030  // Other overlong sequences.
1031  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1032      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1033      "\xc0\xbf"));
1034  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1035      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1036      "\xc1\x80"));
1037  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1038      ConvertUTFResultContainer(sourceIllegal).withScalars(0xfffd, 0xfffd),
1039      "\xc1\xbf"));
1040  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1041      ConvertUTFResultContainer(sourceIllegal)
1042          .withScalars(0xfffd, 0xfffd, 0xfffd),
1043      "\xe0\x9f\xbf"));
1044  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1045      ConvertUTFResultContainer(sourceIllegal)
1046          .withScalars(0xfffd, 0xfffd, 0xfffd),
1047      "\xed\xa0\x80"));
1048  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1049      ConvertUTFResultContainer(sourceIllegal)
1050          .withScalars(0xfffd, 0xfffd, 0xfffd),
1051      "\xed\xbf\xbf"));
1052  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1053      ConvertUTFResultContainer(sourceIllegal)
1054          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1055      "\xf0\x8f\x80\x80"));
1056  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1057      ConvertUTFResultContainer(sourceIllegal)
1058          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd),
1059      "\xf0\x8f\xbf\xbf"));
1060  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1061      ConvertUTFResultContainer(sourceIllegal)
1062          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1063      "\xf8\x87\xbf\xbf\xbf"));
1064  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1065      ConvertUTFResultContainer(sourceIllegal)
1066          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1067      "\xfc\x83\xbf\xbf\xbf\xbf"));
1068
1069  //
1070  // Isolated surrogates
1071  //
1072
1073  // Unicode 6.3.0:
1074  //
1075  //    D71.  High-surrogate code point: A Unicode code point in the range
1076  //    U+D800 to U+DBFF.
1077  //
1078  //    D73.  Low-surrogate code point: A Unicode code point in the range
1079  //    U+DC00 to U+DFFF.
1080
1081  // Note: U+E0100 is <DB40 DD00> in UTF-16.
1082
1083  // High surrogates
1084
1085  // U+D800
1086  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1087      ConvertUTFResultContainer(sourceIllegal)
1088          .withScalars(0xfffd, 0xfffd, 0xfffd),
1089      "\xed\xa0\x80"));
1090
1091  // U+DB40
1092  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1093      ConvertUTFResultContainer(sourceIllegal)
1094          .withScalars(0xfffd, 0xfffd, 0xfffd),
1095      "\xed\xac\xa0"));
1096
1097  // U+DBFF
1098  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1099      ConvertUTFResultContainer(sourceIllegal)
1100          .withScalars(0xfffd, 0xfffd, 0xfffd),
1101      "\xed\xaf\xbf"));
1102
1103  // Low surrogates
1104
1105  // U+DC00
1106  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1107      ConvertUTFResultContainer(sourceIllegal)
1108          .withScalars(0xfffd, 0xfffd, 0xfffd),
1109      "\xed\xb0\x80"));
1110
1111  // U+DD00
1112  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1113      ConvertUTFResultContainer(sourceIllegal)
1114          .withScalars(0xfffd, 0xfffd, 0xfffd),
1115      "\xed\xb4\x80"));
1116
1117  // U+DFFF
1118  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1119      ConvertUTFResultContainer(sourceIllegal)
1120          .withScalars(0xfffd, 0xfffd, 0xfffd),
1121      "\xed\xbf\xbf"));
1122
1123  // Surrogate pairs
1124
1125  // U+D800 U+DC00
1126  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1127      ConvertUTFResultContainer(sourceIllegal)
1128          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1129      "\xed\xa0\x80\xed\xb0\x80"));
1130
1131  // U+D800 U+DD00
1132  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1133      ConvertUTFResultContainer(sourceIllegal)
1134          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1135      "\xed\xa0\x80\xed\xb4\x80"));
1136
1137  // U+D800 U+DFFF
1138  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1139      ConvertUTFResultContainer(sourceIllegal)
1140          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1141      "\xed\xa0\x80\xed\xbf\xbf"));
1142
1143  // U+DB40 U+DC00
1144  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1145      ConvertUTFResultContainer(sourceIllegal)
1146          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1147      "\xed\xac\xa0\xed\xb0\x80"));
1148
1149  // U+DB40 U+DD00
1150  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1151      ConvertUTFResultContainer(sourceIllegal)
1152          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1153      "\xed\xac\xa0\xed\xb4\x80"));
1154
1155  // U+DB40 U+DFFF
1156  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1157      ConvertUTFResultContainer(sourceIllegal)
1158          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1159      "\xed\xac\xa0\xed\xbf\xbf"));
1160
1161  // U+DBFF U+DC00
1162  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1163      ConvertUTFResultContainer(sourceIllegal)
1164          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1165      "\xed\xaf\xbf\xed\xb0\x80"));
1166
1167  // U+DBFF U+DD00
1168  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1169      ConvertUTFResultContainer(sourceIllegal)
1170          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1171      "\xed\xaf\xbf\xed\xb4\x80"));
1172
1173  // U+DBFF U+DFFF
1174  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1175      ConvertUTFResultContainer(sourceIllegal)
1176          .withScalars(0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0xfffd),
1177      "\xed\xaf\xbf\xed\xbf\xbf"));
1178
1179  //
1180  // Noncharacters
1181  //
1182
1183  // Unicode 6.3.0:
1184  //
1185  //    D14.  Noncharacter: A code point that is permanently reserved for
1186  //    internal use and that should never be interchanged. Noncharacters
1187  //    consist of the values U+nFFFE and U+nFFFF (where n is from 0 to 0x10)
1188  //    and the values U+FDD0..U+FDEF.
1189
1190  // U+FFFE
1191  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1192      ConvertUTFResultContainer(conversionOK).withScalars(0xfffe),
1193      "\xef\xbf\xbe"));
1194
1195  // U+FFFF
1196  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1197      ConvertUTFResultContainer(conversionOK).withScalars(0xffff),
1198      "\xef\xbf\xbf"));
1199
1200  // U+1FFFE
1201  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1202      ConvertUTFResultContainer(conversionOK).withScalars(0x1fffe),
1203      "\xf0\x9f\xbf\xbe"));
1204
1205  // U+1FFFF
1206  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1207      ConvertUTFResultContainer(conversionOK).withScalars(0x1ffff),
1208      "\xf0\x9f\xbf\xbf"));
1209
1210  // U+2FFFE
1211  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1212      ConvertUTFResultContainer(conversionOK).withScalars(0x2fffe),
1213      "\xf0\xaf\xbf\xbe"));
1214
1215  // U+2FFFF
1216  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1217      ConvertUTFResultContainer(conversionOK).withScalars(0x2ffff),
1218      "\xf0\xaf\xbf\xbf"));
1219
1220  // U+3FFFE
1221  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1222      ConvertUTFResultContainer(conversionOK).withScalars(0x3fffe),
1223      "\xf0\xbf\xbf\xbe"));
1224
1225  // U+3FFFF
1226  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1227      ConvertUTFResultContainer(conversionOK).withScalars(0x3ffff),
1228      "\xf0\xbf\xbf\xbf"));
1229
1230  // U+4FFFE
1231  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1232      ConvertUTFResultContainer(conversionOK).withScalars(0x4fffe),
1233      "\xf1\x8f\xbf\xbe"));
1234
1235  // U+4FFFF
1236  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1237      ConvertUTFResultContainer(conversionOK).withScalars(0x4ffff),
1238      "\xf1\x8f\xbf\xbf"));
1239
1240  // U+5FFFE
1241  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1242      ConvertUTFResultContainer(conversionOK).withScalars(0x5fffe),
1243      "\xf1\x9f\xbf\xbe"));
1244
1245  // U+5FFFF
1246  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1247      ConvertUTFResultContainer(conversionOK).withScalars(0x5ffff),
1248      "\xf1\x9f\xbf\xbf"));
1249
1250  // U+6FFFE
1251  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1252      ConvertUTFResultContainer(conversionOK).withScalars(0x6fffe),
1253      "\xf1\xaf\xbf\xbe"));
1254
1255  // U+6FFFF
1256  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1257      ConvertUTFResultContainer(conversionOK).withScalars(0x6ffff),
1258      "\xf1\xaf\xbf\xbf"));
1259
1260  // U+7FFFE
1261  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1262      ConvertUTFResultContainer(conversionOK).withScalars(0x7fffe),
1263      "\xf1\xbf\xbf\xbe"));
1264
1265  // U+7FFFF
1266  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1267      ConvertUTFResultContainer(conversionOK).withScalars(0x7ffff),
1268      "\xf1\xbf\xbf\xbf"));
1269
1270  // U+8FFFE
1271  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1272      ConvertUTFResultContainer(conversionOK).withScalars(0x8fffe),
1273      "\xf2\x8f\xbf\xbe"));
1274
1275  // U+8FFFF
1276  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1277      ConvertUTFResultContainer(conversionOK).withScalars(0x8ffff),
1278      "\xf2\x8f\xbf\xbf"));
1279
1280  // U+9FFFE
1281  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1282      ConvertUTFResultContainer(conversionOK).withScalars(0x9fffe),
1283      "\xf2\x9f\xbf\xbe"));
1284
1285  // U+9FFFF
1286  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1287      ConvertUTFResultContainer(conversionOK).withScalars(0x9ffff),
1288      "\xf2\x9f\xbf\xbf"));
1289
1290  // U+AFFFE
1291  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1292      ConvertUTFResultContainer(conversionOK).withScalars(0xafffe),
1293      "\xf2\xaf\xbf\xbe"));
1294
1295  // U+AFFFF
1296  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1297      ConvertUTFResultContainer(conversionOK).withScalars(0xaffff),
1298      "\xf2\xaf\xbf\xbf"));
1299
1300  // U+BFFFE
1301  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1302      ConvertUTFResultContainer(conversionOK).withScalars(0xbfffe),
1303      "\xf2\xbf\xbf\xbe"));
1304
1305  // U+BFFFF
1306  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1307      ConvertUTFResultContainer(conversionOK).withScalars(0xbffff),
1308      "\xf2\xbf\xbf\xbf"));
1309
1310  // U+CFFFE
1311  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1312      ConvertUTFResultContainer(conversionOK).withScalars(0xcfffe),
1313      "\xf3\x8f\xbf\xbe"));
1314
1315  // U+CFFFF
1316  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1317      ConvertUTFResultContainer(conversionOK).withScalars(0xcfffF),
1318      "\xf3\x8f\xbf\xbf"));
1319
1320  // U+DFFFE
1321  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1322      ConvertUTFResultContainer(conversionOK).withScalars(0xdfffe),
1323      "\xf3\x9f\xbf\xbe"));
1324
1325  // U+DFFFF
1326  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1327      ConvertUTFResultContainer(conversionOK).withScalars(0xdffff),
1328      "\xf3\x9f\xbf\xbf"));
1329
1330  // U+EFFFE
1331  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1332      ConvertUTFResultContainer(conversionOK).withScalars(0xefffe),
1333      "\xf3\xaf\xbf\xbe"));
1334
1335  // U+EFFFF
1336  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1337      ConvertUTFResultContainer(conversionOK).withScalars(0xeffff),
1338      "\xf3\xaf\xbf\xbf"));
1339
1340  // U+FFFFE
1341  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1342      ConvertUTFResultContainer(conversionOK).withScalars(0xffffe),
1343      "\xf3\xbf\xbf\xbe"));
1344
1345  // U+FFFFF
1346  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1347      ConvertUTFResultContainer(conversionOK).withScalars(0xfffff),
1348      "\xf3\xbf\xbf\xbf"));
1349
1350  // U+10FFFE
1351  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1352      ConvertUTFResultContainer(conversionOK).withScalars(0x10fffe),
1353      "\xf4\x8f\xbf\xbe"));
1354
1355  // U+10FFFF
1356  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1357      ConvertUTFResultContainer(conversionOK).withScalars(0x10ffff),
1358      "\xf4\x8f\xbf\xbf"));
1359
1360  // U+FDD0
1361  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1362      ConvertUTFResultContainer(conversionOK).withScalars(0xfdd0),
1363      "\xef\xb7\x90"));
1364
1365  // U+FDD1
1366  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1367      ConvertUTFResultContainer(conversionOK).withScalars(0xfdd1),
1368      "\xef\xb7\x91"));
1369
1370  // U+FDD2
1371  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1372      ConvertUTFResultContainer(conversionOK).withScalars(0xfdd2),
1373      "\xef\xb7\x92"));
1374
1375  // U+FDD3
1376  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1377      ConvertUTFResultContainer(conversionOK).withScalars(0xfdd3),
1378      "\xef\xb7\x93"));
1379
1380  // U+FDD4
1381  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1382      ConvertUTFResultContainer(conversionOK).withScalars(0xfdd4),
1383      "\xef\xb7\x94"));
1384
1385  // U+FDD5
1386  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1387      ConvertUTFResultContainer(conversionOK).withScalars(0xfdd5),
1388      "\xef\xb7\x95"));
1389
1390  // U+FDD6
1391  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1392      ConvertUTFResultContainer(conversionOK).withScalars(0xfdd6),
1393      "\xef\xb7\x96"));
1394
1395  // U+FDD7
1396  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1397      ConvertUTFResultContainer(conversionOK).withScalars(0xfdd7),
1398      "\xef\xb7\x97"));
1399
1400  // U+FDD8
1401  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1402      ConvertUTFResultContainer(conversionOK).withScalars(0xfdd8),
1403      "\xef\xb7\x98"));
1404
1405  // U+FDD9
1406  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1407      ConvertUTFResultContainer(conversionOK).withScalars(0xfdd9),
1408      "\xef\xb7\x99"));
1409
1410  // U+FDDA
1411  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1412      ConvertUTFResultContainer(conversionOK).withScalars(0xfdda),
1413      "\xef\xb7\x9a"));
1414
1415  // U+FDDB
1416  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1417      ConvertUTFResultContainer(conversionOK).withScalars(0xfddb),
1418      "\xef\xb7\x9b"));
1419
1420  // U+FDDC
1421  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1422      ConvertUTFResultContainer(conversionOK).withScalars(0xfddc),
1423      "\xef\xb7\x9c"));
1424
1425  // U+FDDD
1426  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1427      ConvertUTFResultContainer(conversionOK).withScalars(0xfddd),
1428      "\xef\xb7\x9d"));
1429
1430  // U+FDDE
1431  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1432      ConvertUTFResultContainer(conversionOK).withScalars(0xfdde),
1433      "\xef\xb7\x9e"));
1434
1435  // U+FDDF
1436  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1437      ConvertUTFResultContainer(conversionOK).withScalars(0xfddf),
1438      "\xef\xb7\x9f"));
1439
1440  // U+FDE0
1441  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1442      ConvertUTFResultContainer(conversionOK).withScalars(0xfde0),
1443      "\xef\xb7\xa0"));
1444
1445  // U+FDE1
1446  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1447      ConvertUTFResultContainer(conversionOK).withScalars(0xfde1),
1448      "\xef\xb7\xa1"));
1449
1450  // U+FDE2
1451  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1452      ConvertUTFResultContainer(conversionOK).withScalars(0xfde2),
1453      "\xef\xb7\xa2"));
1454
1455  // U+FDE3
1456  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1457      ConvertUTFResultContainer(conversionOK).withScalars(0xfde3),
1458      "\xef\xb7\xa3"));
1459
1460  // U+FDE4
1461  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1462      ConvertUTFResultContainer(conversionOK).withScalars(0xfde4),
1463      "\xef\xb7\xa4"));
1464
1465  // U+FDE5
1466  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1467      ConvertUTFResultContainer(conversionOK).withScalars(0xfde5),
1468      "\xef\xb7\xa5"));
1469
1470  // U+FDE6
1471  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1472      ConvertUTFResultContainer(conversionOK).withScalars(0xfde6),
1473      "\xef\xb7\xa6"));
1474
1475  // U+FDE7
1476  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1477      ConvertUTFResultContainer(conversionOK).withScalars(0xfde7),
1478      "\xef\xb7\xa7"));
1479
1480  // U+FDE8
1481  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1482      ConvertUTFResultContainer(conversionOK).withScalars(0xfde8),
1483      "\xef\xb7\xa8"));
1484
1485  // U+FDE9
1486  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1487      ConvertUTFResultContainer(conversionOK).withScalars(0xfde9),
1488      "\xef\xb7\xa9"));
1489
1490  // U+FDEA
1491  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1492      ConvertUTFResultContainer(conversionOK).withScalars(0xfdea),
1493      "\xef\xb7\xaa"));
1494
1495  // U+FDEB
1496  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1497      ConvertUTFResultContainer(conversionOK).withScalars(0xfdeb),
1498      "\xef\xb7\xab"));
1499
1500  // U+FDEC
1501  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1502      ConvertUTFResultContainer(conversionOK).withScalars(0xfdec),
1503      "\xef\xb7\xac"));
1504
1505  // U+FDED
1506  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1507      ConvertUTFResultContainer(conversionOK).withScalars(0xfded),
1508      "\xef\xb7\xad"));
1509
1510  // U+FDEE
1511  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1512      ConvertUTFResultContainer(conversionOK).withScalars(0xfdee),
1513      "\xef\xb7\xae"));
1514
1515  // U+FDEF
1516  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1517      ConvertUTFResultContainer(conversionOK).withScalars(0xfdef),
1518      "\xef\xb7\xaf"));
1519
1520  // U+FDF0 (just past the U+FDD0..U+FDEF noncharacter range)
1521  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1522      ConvertUTFResultContainer(conversionOK).withScalars(0xfdf0),
1523      "\xef\xb7\xb0"));
1524
1525  // U+FDF1
1526  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1527      ConvertUTFResultContainer(conversionOK).withScalars(0xfdf1),
1528      "\xef\xb7\xb1"));
1529
1530  // U+FDF2
1531  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1532      ConvertUTFResultContainer(conversionOK).withScalars(0xfdf2),
1533      "\xef\xb7\xb2"));
1534
1535  // U+FDF3
1536  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1537      ConvertUTFResultContainer(conversionOK).withScalars(0xfdf3),
1538      "\xef\xb7\xb3"));
1539
1540  // U+FDF4
1541  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1542      ConvertUTFResultContainer(conversionOK).withScalars(0xfdf4),
1543      "\xef\xb7\xb4"));
1544
1545  // U+FDF5
1546  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1547      ConvertUTFResultContainer(conversionOK).withScalars(0xfdf5),
1548      "\xef\xb7\xb5"));
1549
1550  // U+FDF6
1551  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1552      ConvertUTFResultContainer(conversionOK).withScalars(0xfdf6),
1553      "\xef\xb7\xb6"));
1554
1555  // U+FDF7
1556  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1557      ConvertUTFResultContainer(conversionOK).withScalars(0xfdf7),
1558      "\xef\xb7\xb7"));
1559
1560  // U+FDF8
1561  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1562      ConvertUTFResultContainer(conversionOK).withScalars(0xfdf8),
1563      "\xef\xb7\xb8"));
1564
1565  // U+FDF9
1566  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1567      ConvertUTFResultContainer(conversionOK).withScalars(0xfdf9),
1568      "\xef\xb7\xb9"));
1569
1570  // U+FDFA
1571  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1572      ConvertUTFResultContainer(conversionOK).withScalars(0xfdfa),
1573      "\xef\xb7\xba"));
1574
1575  // U+FDFB
1576  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1577      ConvertUTFResultContainer(conversionOK).withScalars(0xfdfb),
1578      "\xef\xb7\xbb"));
1579
1580  // U+FDFC
1581  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1582      ConvertUTFResultContainer(conversionOK).withScalars(0xfdfc),
1583      "\xef\xb7\xbc"));
1584
1585  // U+FDFD
1586  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1587      ConvertUTFResultContainer(conversionOK).withScalars(0xfdfd),
1588      "\xef\xb7\xbd"));
1589
1590  // U+FDFE
1591  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1592      ConvertUTFResultContainer(conversionOK).withScalars(0xfdfe),
1593      "\xef\xb7\xbe"));
1594
1595  // U+FDFF
1596  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1597      ConvertUTFResultContainer(conversionOK).withScalars(0xfdff),
1598      "\xef\xb7\xbf"));
1599}
1600
1601TEST(ConvertUTFTest, UTF8ToUTF32PartialLenient) {
1602  // U+0041 LATIN CAPITAL LETTER A
1603  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1604      ConvertUTFResultContainer(conversionOK).withScalars(0x0041),
1605      "\x41", true));
1606
1607  //
1608  // Sequences with one continuation byte missing
1609  //
1610
1611  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1612      ConvertUTFResultContainer(sourceExhausted),
1613      "\xc2", true));
1614  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1615      ConvertUTFResultContainer(sourceExhausted),
1616      "\xdf", true));
1617  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1618      ConvertUTFResultContainer(sourceExhausted),
1619      "\xe0\xa0", true));
1620  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1621      ConvertUTFResultContainer(sourceExhausted),
1622      "\xe0\xbf", true));
1623  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1624      ConvertUTFResultContainer(sourceExhausted),
1625      "\xe1\x80", true));
1626  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1627      ConvertUTFResultContainer(sourceExhausted),
1628      "\xec\xbf", true));
1629  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1630      ConvertUTFResultContainer(sourceExhausted),
1631      "\xed\x80", true));
1632  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1633      ConvertUTFResultContainer(sourceExhausted),
1634      "\xed\x9f", true));
1635  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1636      ConvertUTFResultContainer(sourceExhausted),
1637      "\xee\x80", true));
1638  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1639      ConvertUTFResultContainer(sourceExhausted),
1640      "\xef\xbf", true));
1641  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1642      ConvertUTFResultContainer(sourceExhausted),
1643      "\xf0\x90\x80", true));
1644  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1645      ConvertUTFResultContainer(sourceExhausted),
1646      "\xf0\xbf\xbf", true));
1647  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1648      ConvertUTFResultContainer(sourceExhausted),
1649      "\xf1\x80\x80", true));
1650  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1651      ConvertUTFResultContainer(sourceExhausted),
1652      "\xf3\xbf\xbf", true));
1653  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1654      ConvertUTFResultContainer(sourceExhausted),
1655      "\xf4\x80\x80", true));
1656  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1657      ConvertUTFResultContainer(sourceExhausted),
1658      "\xf4\x8f\xbf", true));
1659
1660  EXPECT_TRUE(CheckConvertUTF8ToUnicodeScalars(
1661      ConvertUTFResultContainer(sourceExhausted).withScalars(0x0041),
1662      "\x41\xc2", true));
1663}
1664
1665