// Tencent is pleased to support the open source community by making RapidJSON available.
//
// Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
//
// Licensed under the MIT License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// http://opensource.org/licenses/MIT
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "unittest.h"
#include "rapidjson/filereadstream.h"
#include "rapidjson/filewritestream.h"
#include "rapidjson/encodedstream.h"
#include "rapidjson/stringbuffer.h"
#include <iostream>

using namespace rapidjson;

// Verification of encoders/decoders with Hoehrmann's UTF8 decoder

// Code point ranges exercised by the encoding tests, taken from
// http://www.unicode.org/Public/UNIDATA/Blocks.txt
//
// Layout: a flat list of (first, last) pairs, both bounds inclusive, closed by
// a single 0xFFFFFFFF sentinel. The surrogate blocks (U+D800..U+DFFF) are kept
// only as commented-out entries, so they are never iterated.
static const unsigned kCodepointRanges[] = {
    0x0000,     0x007F,     // Basic Latin
    0x0080,     0x00FF,     // Latin-1 Supplement
    0x0100,     0x017F,     // Latin Extended-A
    0x0180,     0x024F,     // Latin Extended-B
    0x0250,     0x02AF,     // IPA Extensions
    0x02B0,     0x02FF,     // Spacing Modifier Letters
    0x0300,     0x036F,     // Combining Diacritical Marks
    0x0370,     0x03FF,     // Greek and Coptic
    0x0400,     0x04FF,     // Cyrillic
    0x0500,     0x052F,     // Cyrillic Supplement
    0x0530,     0x058F,     // Armenian
    0x0590,     0x05FF,     // Hebrew
    0x0600,     0x06FF,     // Arabic
    0x0700,     0x074F,     // Syriac
    0x0750,     0x077F,     // Arabic Supplement
    0x0780,     0x07BF,     // Thaana
    0x07C0,     0x07FF,     // NKo
    0x0800,     0x083F,     // Samaritan
    0x0840,     0x085F,     // Mandaic
    0x0900,     0x097F,     // Devanagari
    0x0980,     0x09FF,     // Bengali
    0x0A00,     0x0A7F,     // Gurmukhi
    0x0A80,     0x0AFF,     // Gujarati
    0x0B00,     0x0B7F,     // Oriya
    0x0B80,     0x0BFF,     // Tamil
    0x0C00,     0x0C7F,     // Telugu
    0x0C80,     0x0CFF,     // Kannada
    0x0D00,     0x0D7F,     // Malayalam
    0x0D80,     0x0DFF,     // Sinhala
    0x0E00,     0x0E7F,     // Thai
    0x0E80,     0x0EFF,     // Lao
    0x0F00,     0x0FFF,     // Tibetan
    0x1000,     0x109F,     // Myanmar
    0x10A0,     0x10FF,     // Georgian
    0x1100,     0x11FF,     // Hangul Jamo
    0x1200,     0x137F,     // Ethiopic
    0x1380,     0x139F,     // Ethiopic Supplement
    0x13A0,     0x13FF,     // Cherokee
    0x1400,     0x167F,     // Unified Canadian Aboriginal Syllabics
    0x1680,     0x169F,     // Ogham
    0x16A0,     0x16FF,     // Runic
    0x1700,     0x171F,     // Tagalog
    0x1720,     0x173F,     // Hanunoo
    0x1740,     0x175F,     // Buhid
    0x1760,     0x177F,     // Tagbanwa
    0x1780,     0x17FF,     // Khmer
    0x1800,     0x18AF,     // Mongolian
    0x18B0,     0x18FF,     // Unified Canadian Aboriginal Syllabics Extended
    0x1900,     0x194F,     // Limbu
    0x1950,     0x197F,     // Tai Le
    0x1980,     0x19DF,     // New Tai Lue
    0x19E0,     0x19FF,     // Khmer Symbols
    0x1A00,     0x1A1F,     // Buginese
    0x1A20,     0x1AAF,     // Tai Tham
    0x1B00,     0x1B7F,     // Balinese
    0x1B80,     0x1BBF,     // Sundanese
    0x1BC0,     0x1BFF,     // Batak
    0x1C00,     0x1C4F,     // Lepcha
    0x1C50,     0x1C7F,     // Ol Chiki
    0x1CD0,     0x1CFF,     // Vedic Extensions
    0x1D00,     0x1D7F,     // Phonetic Extensions
    0x1D80,     0x1DBF,     // Phonetic Extensions Supplement
    0x1DC0,     0x1DFF,     // Combining Diacritical Marks Supplement
    0x1E00,     0x1EFF,     // Latin Extended Additional
    0x1F00,     0x1FFF,     // Greek Extended
    0x2000,     0x206F,     // General Punctuation
    0x2070,     0x209F,     // Superscripts and Subscripts
    0x20A0,     0x20CF,     // Currency Symbols
    0x20D0,     0x20FF,     // Combining Diacritical Marks for Symbols
    0x2100,     0x214F,     // Letterlike Symbols
    0x2150,     0x218F,     // Number Forms
    0x2190,     0x21FF,     // Arrows
    0x2200,     0x22FF,     // Mathematical Operators
    0x2300,     0x23FF,     // Miscellaneous Technical
    0x2400,     0x243F,     // Control Pictures
    0x2440,     0x245F,     // Optical Character Recognition
    0x2460,     0x24FF,     // Enclosed Alphanumerics
    0x2500,     0x257F,     // Box Drawing
    0x2580,     0x259F,     // Block Elements
    0x25A0,     0x25FF,     // Geometric Shapes
    0x2600,     0x26FF,     // Miscellaneous Symbols
    0x2700,     0x27BF,     // Dingbats
    0x27C0,     0x27EF,     // Miscellaneous Mathematical Symbols-A
    0x27F0,     0x27FF,     // Supplemental Arrows-A
    0x2800,     0x28FF,     // Braille Patterns
    0x2900,     0x297F,     // Supplemental Arrows-B
    0x2980,     0x29FF,     // Miscellaneous Mathematical Symbols-B
    0x2A00,     0x2AFF,     // Supplemental Mathematical Operators
    0x2B00,     0x2BFF,     // Miscellaneous Symbols and Arrows
    0x2C00,     0x2C5F,     // Glagolitic
    0x2C60,     0x2C7F,     // Latin Extended-C
    0x2C80,     0x2CFF,     // Coptic
    0x2D00,     0x2D2F,     // Georgian Supplement
    0x2D30,     0x2D7F,     // Tifinagh
    0x2D80,     0x2DDF,     // Ethiopic Extended
    0x2DE0,     0x2DFF,     // Cyrillic Extended-A
    0x2E00,     0x2E7F,     // Supplemental Punctuation
    0x2E80,     0x2EFF,     // CJK Radicals Supplement
    0x2F00,     0x2FDF,     // Kangxi Radicals
    0x2FF0,     0x2FFF,     // Ideographic Description Characters
    0x3000,     0x303F,     // CJK Symbols and Punctuation
    0x3040,     0x309F,     // Hiragana
    0x30A0,     0x30FF,     // Katakana
    0x3100,     0x312F,     // Bopomofo
    0x3130,     0x318F,     // Hangul Compatibility Jamo
    0x3190,     0x319F,     // Kanbun
    0x31A0,     0x31BF,     // Bopomofo Extended
    0x31C0,     0x31EF,     // CJK Strokes
    0x31F0,     0x31FF,     // Katakana Phonetic Extensions
    0x3200,     0x32FF,     // Enclosed CJK Letters and Months
    0x3300,     0x33FF,     // CJK Compatibility
    0x3400,     0x4DBF,     // CJK Unified Ideographs Extension A
    0x4DC0,     0x4DFF,     // Yijing Hexagram Symbols
    0x4E00,     0x9FFF,     // CJK Unified Ideographs
    0xA000,     0xA48F,     // Yi Syllables
    0xA490,     0xA4CF,     // Yi Radicals
    0xA4D0,     0xA4FF,     // Lisu
    0xA500,     0xA63F,     // Vai
    0xA640,     0xA69F,     // Cyrillic Extended-B
    0xA6A0,     0xA6FF,     // Bamum
    0xA700,     0xA71F,     // Modifier Tone Letters
    0xA720,     0xA7FF,     // Latin Extended-D
    0xA800,     0xA82F,     // Syloti Nagri
    0xA830,     0xA83F,     // Common Indic Number Forms
    0xA840,     0xA87F,     // Phags-pa
    0xA880,     0xA8DF,     // Saurashtra
    0xA8E0,     0xA8FF,     // Devanagari Extended
    0xA900,     0xA92F,     // Kayah Li
    0xA930,     0xA95F,     // Rejang
    0xA960,     0xA97F,     // Hangul Jamo Extended-A
    0xA980,     0xA9DF,     // Javanese
    0xAA00,     0xAA5F,     // Cham
    0xAA60,     0xAA7F,     // Myanmar Extended-A
    0xAA80,     0xAADF,     // Tai Viet
    0xAB00,     0xAB2F,     // Ethiopic Extended-A
    0xABC0,     0xABFF,     // Meetei Mayek
    0xAC00,     0xD7AF,     // Hangul Syllables
    0xD7B0,     0xD7FF,     // Hangul Jamo Extended-B
    //0xD800,       0xDB7F,     // High Surrogates
    //0xDB80,       0xDBFF,     // High Private Use Surrogates
    //0xDC00,       0xDFFF,     // Low Surrogates
    0xE000,     0xF8FF,     // Private Use Area
    0xF900,     0xFAFF,     // CJK Compatibility Ideographs
    0xFB00,     0xFB4F,     // Alphabetic Presentation Forms
    0xFB50,     0xFDFF,     // Arabic Presentation Forms-A
    0xFE00,     0xFE0F,     // Variation Selectors
    0xFE10,     0xFE1F,     // Vertical Forms
    0xFE20,     0xFE2F,     // Combining Half Marks
    0xFE30,     0xFE4F,     // CJK Compatibility Forms
    0xFE50,     0xFE6F,     // Small Form Variants
    0xFE70,     0xFEFF,     // Arabic Presentation Forms-B
    0xFF00,     0xFFEF,     // Halfwidth and Fullwidth Forms
    0xFFF0,     0xFFFF,     // Specials
    0x10000,    0x1007F,    // Linear B Syllabary
    0x10080,    0x100FF,    // Linear B Ideograms
    0x10100,    0x1013F,    // Aegean Numbers
    0x10140,    0x1018F,    // Ancient Greek Numbers
    0x10190,    0x101CF,    // Ancient Symbols
    0x101D0,    0x101FF,    // Phaistos Disc
    0x10280,    0x1029F,    // Lycian
    0x102A0,    0x102DF,    // Carian
    0x10300,    0x1032F,    // Old Italic
    0x10330,    0x1034F,    // Gothic
    0x10380,    0x1039F,    // Ugaritic
    0x103A0,    0x103DF,    // Old Persian
    0x10400,    0x1044F,    // Deseret
    0x10450,    0x1047F,    // Shavian
    0x10480,    0x104AF,    // Osmanya
    0x10800,    0x1083F,    // Cypriot Syllabary
    0x10840,    0x1085F,    // Imperial Aramaic
    0x10900,    0x1091F,    // Phoenician
    0x10920,    0x1093F,    // Lydian
    0x10A00,    0x10A5F,    // Kharoshthi
    0x10A60,    0x10A7F,    // Old South Arabian
    0x10B00,    0x10B3F,    // Avestan
    0x10B40,    0x10B5F,    // Inscriptional Parthian
    0x10B60,    0x10B7F,    // Inscriptional Pahlavi
    0x10C00,    0x10C4F,    // Old Turkic
    0x10E60,    0x10E7F,    // Rumi Numeral Symbols
    0x11000,    0x1107F,    // Brahmi
    0x11080,    0x110CF,    // Kaithi
    0x12000,    0x123FF,    // Cuneiform
    0x12400,    0x1247F,    // Cuneiform Numbers and Punctuation
    0x13000,    0x1342F,    // Egyptian Hieroglyphs
    0x16800,    0x16A3F,    // Bamum Supplement
    0x1B000,    0x1B0FF,    // Kana Supplement
    0x1D000,    0x1D0FF,    // Byzantine Musical Symbols
    0x1D100,    0x1D1FF,    // Musical Symbols
    0x1D200,    0x1D24F,    // Ancient Greek Musical Notation
    0x1D300,    0x1D35F,    // Tai Xuan Jing Symbols
    0x1D360,    0x1D37F,    // Counting Rod Numerals
    0x1D400,    0x1D7FF,    // Mathematical Alphanumeric Symbols
    0x1F000,    0x1F02F,    // Mahjong Tiles
    0x1F030,    0x1F09F,    // Domino Tiles
    0x1F0A0,    0x1F0FF,    // Playing Cards
    0x1F100,    0x1F1FF,    // Enclosed Alphanumeric Supplement
    0x1F200,    0x1F2FF,    // Enclosed Ideographic Supplement
    0x1F300,    0x1F5FF,    // Miscellaneous Symbols And Pictographs
    0x1F600,    0x1F64F,    // Emoticons
    0x1F680,    0x1F6FF,    // Transport And Map Symbols
    0x1F700,    0x1F77F,    // Alchemical Symbols
    0x20000,    0x2A6DF,    // CJK Unified Ideographs Extension B
    0x2A700,    0x2B73F,    // CJK Unified Ideographs Extension C
    0x2B740,    0x2B81F,    // CJK Unified Ideographs Extension D
    0x2F800,    0x2FA1F,    // CJK Compatibility Ideographs Supplement
    0xE0000,    0xE007F,    // Tags
    0xE0100,    0xE01EF,    // Variation Selectors Supplement
    0xF0000,    0xFFFFF,    // Supplementary Private Use Area-A
    0x100000,   0x10FFFF,   // Supplementary Private Use Area-B
    0xFFFFFFFF
};

// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

// Automaton states: UTF8_ACCEPT means a complete code point has been decoded,
// UTF8_REJECT means the input is not valid UTF-8.
#define UTF8_ACCEPT 0u
#define UTF8_REJECT 12u

static const unsigned char utf8d[] = {
    // The first part of the table maps bytes to character classes
    // to reduce the size of the transition table and create bitmasks.
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
    0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
    12,36,12,12,12,12,12,12,12,12,12,12,
};

// Feeds one byte into the UTF-8 decoding automaton.
//
// state  in/out automaton state; start at UTF8_ACCEPT. It must be a value
//        previously produced by this function (or UTF8_ACCEPT), because it is
//        used directly as a row offset into the transition table.
// codep  in/out accumulator; holds the decoded code point whenever the
//        returned state is UTF8_ACCEPT.
// byte   the next input byte, 0..255. Larger values are not bytes and would
//        index past the 256-entry class table, so they are rejected.
//
// Returns the new state (also stored in *state).
static inline unsigned decode(unsigned* state, unsigned* codep, unsigned byte) {
    if (byte > 0xFFu) {
        *state = UTF8_REJECT;
        return *state;
    }

    const unsigned type = utf8d[byte];

    // A continuation byte contributes its low six bits; a lead byte is masked
    // according to its character class to drop the length-marker bits.
    *codep = (*state != UTF8_ACCEPT)
        ? (byte & 0x3fu) | (*codep << 6)
        : (0xffu >> type) & byte;

    *state = utf8d[256 + *state + type];
    return *state;
}

//static bool IsUTF8(unsigned char* s) {
//  unsigned codepoint, state = 0;
//
//  while (*s)
//      decode(&state, &codepoint, *s++);
//
//  return state == UTF8_ACCEPT;
//}

286TEST(EncodingsTest, UTF8) {
287    StringBuffer os, os2;
288    for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
289        for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
290            os.Clear();
291            UTF8<>::Encode(os, codepoint);
292            const char* encodedStr = os.GetString();
293
294            // Decode with Hoehrmann
295            {
296                unsigned decodedCodepoint = 0;
297                unsigned state = 0;
298
299                unsigned decodedCount = 0;
300                for (const char* s = encodedStr; *s; ++s)
301                    if (!decode(&state, &decodedCodepoint, (unsigned char)*s)) {
302                        EXPECT_EQ(codepoint, decodedCodepoint);
303                        decodedCount++;
304                    }
305
306                if (*encodedStr)                // This decoder cannot handle U+0000
307                    EXPECT_EQ(1u, decodedCount);    // Should only contain one code point
308
309                EXPECT_EQ(UTF8_ACCEPT, state);
310                if (UTF8_ACCEPT != state)
311                    std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
312            }
313
314            // Decode
315            {
316                StringStream is(encodedStr);
317                unsigned decodedCodepoint;
318                bool result = UTF8<>::Decode(is, &decodedCodepoint);
319                EXPECT_TRUE(result);
320                EXPECT_EQ(codepoint, decodedCodepoint);
321                if (!result || codepoint != decodedCodepoint)
322                    std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
323            }
324
325            // Validate
326            {
327                StringStream is(encodedStr);
328                os2.Clear();
329                bool result = UTF8<>::Validate(is, os2);
330                EXPECT_TRUE(result);
331                EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
332            }
333        }
334    }
335}
336
337TEST(EncodingsTest, UTF16) {
338    GenericStringBuffer<UTF16<> > os, os2;
339    GenericStringBuffer<UTF8<> > utf8os;
340    for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
341        for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
342            os.Clear();
343            UTF16<>::Encode(os, codepoint);
344            const UTF16<>::Ch* encodedStr = os.GetString();
345
346            // Encode with Hoehrmann's code
347            if (codepoint != 0) // cannot handle U+0000
348            {
349                // encode with UTF8<> first
350                utf8os.Clear();
351                UTF8<>::Encode(utf8os, codepoint);
352
353                // transcode from UTF8 to UTF16 with Hoehrmann's code
354                unsigned decodedCodepoint = 0;
355                unsigned state = 0;
356                UTF16<>::Ch buffer[3], *p = &buffer[0];
357                for (const char* s = utf8os.GetString(); *s; ++s) {
358                    if (!decode(&state, &decodedCodepoint, (unsigned char)*s))
359                        break;
360                }
361
362                if (codepoint <= 0xFFFF)
363                    *p++ = static_cast<UTF16<>::Ch>(decodedCodepoint);
364                else {
365                    // Encode code points above U+FFFF as surrogate pair.
366                    *p++ = static_cast<UTF16<>::Ch>(0xD7C0 + (decodedCodepoint >> 10));
367                    *p++ = static_cast<UTF16<>::Ch>(0xDC00 + (decodedCodepoint & 0x3FF));
368                }
369                *p++ = '\0';
370
371                EXPECT_EQ(0, StrCmp(buffer, encodedStr));
372            }
373
374            // Decode
375            {
376                GenericStringStream<UTF16<> > is(encodedStr);
377                unsigned decodedCodepoint;
378                bool result = UTF16<>::Decode(is, &decodedCodepoint);
379                EXPECT_TRUE(result);
380                EXPECT_EQ(codepoint, decodedCodepoint);
381                if (!result || codepoint != decodedCodepoint)
382                    std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
383            }
384
385            // Validate
386            {
387                GenericStringStream<UTF16<> > is(encodedStr);
388                os2.Clear();
389                bool result = UTF16<>::Validate(is, os2);
390                EXPECT_TRUE(result);
391                EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
392            }
393        }
394    }
395}
396
397TEST(EncodingsTest, UTF32) {
398    GenericStringBuffer<UTF32<> > os, os2;
399    for (const unsigned* range = kCodepointRanges; *range != 0xFFFFFFFF; range += 2) {
400        for (unsigned codepoint = range[0]; codepoint <= range[1]; ++codepoint) {
401            os.Clear();
402            UTF32<>::Encode(os, codepoint);
403            const UTF32<>::Ch* encodedStr = os.GetString();
404
405            // Decode
406            {
407                GenericStringStream<UTF32<> > is(encodedStr);
408                unsigned decodedCodepoint;
409                bool result = UTF32<>::Decode(is, &decodedCodepoint);
410                EXPECT_TRUE(result);
411                EXPECT_EQ(codepoint, decodedCodepoint);
412                if (!result || codepoint != decodedCodepoint)
413                    std::cout << std::hex << codepoint << " " << decodedCodepoint << std::endl;
414            }
415
416            // Validate
417            {
418                GenericStringStream<UTF32<> > is(encodedStr);
419                os2.Clear();
420                bool result = UTF32<>::Validate(is, os2);
421                EXPECT_TRUE(result);
422                EXPECT_EQ(0, StrCmp(encodedStr, os2.GetString()));
423            }
424        }
425    }
426}
427