1/*
2 * Copyright (C) 2013 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.common.base;
18
19import com.google.common.annotations.GwtCompatible;
20
21import junit.framework.TestCase;
22
23/**
24 * Unit tests for {@link Utf8}.
25 *
26 * @author Jon Perlow
27 * @author Martin Buchholz
28 * @author Clément Roux
29 */
30@GwtCompatible(emulated = true)
31public class Utf8Test extends TestCase {
32  public void testEncodedLength_validStrings() {
33    assertEquals(0, Utf8.encodedLength(""));
34    assertEquals(11, Utf8.encodedLength("Hello world"));
35    assertEquals(8, Utf8.encodedLength("Résumé"));
36    assertEquals(461, Utf8.encodedLength("威廉·莎士比亞(William Shakespeare,"
37        + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
38        + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
39        + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
40        + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
41        + "哈都拕人翻譯做好多話。"));
42    // A surrogate pair
43    assertEquals(4, Utf8.encodedLength(
44        newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE)));
45  }
46
47  public void testEncodedLength_invalidStrings() {
48    testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0);
49    testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6);
50    testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0);
51    testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6);
52    testEncodedLengthFails(
53        newString(
54            Character.MIN_HIGH_SURROGATE,
55            Character.MIN_HIGH_SURROGATE), 0);
56  }
57
58  private static void testEncodedLengthFails(String invalidString,
59      int invalidCodePointIndex) {
60    try {
61      Utf8.encodedLength(invalidString);
62      fail();
63    } catch (IllegalArgumentException expected) {
64      assertEquals("Unpaired surrogate at index " + invalidCodePointIndex,
65          expected.getMessage());
66    }
67  }
68
69  // 128 - [chars 0x0000 to 0x007f]
70  private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
71      0x007f - 0x0000 + 1;
72
73  // 128
74  private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
75      ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
76
77  // 1920 [chars 0x0080 to 0x07FF]
78  private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS =
79      0x07FF - 0x0080 + 1;
80
81  // 18,304
82  private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
83      // Both bytes are one byte characters
84      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
85      // The possible number of two byte characters
86      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
87
88  // 2048
89  private static final long THREE_BYTE_SURROGATES = 2 * 1024;
90
91  // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
92  private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
93      0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
94
95  // 2,650,112
96  private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
97      // All one byte characters
98      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
99      // One two byte character and a one byte character
100      2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
101          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
102       // Three byte characters
103      THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
104
105  // 1,048,576 [chars 0x10000L to 0x10FFFF]
106  private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
107      0x10FFFF - 0x10000L + 1;
108
109  // 289,571,839
110  private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
111      // All one byte characters
112      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
113      // One and three byte characters
114      2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
115          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
116      // Two two byte characters
117      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
118      // Permutations of one and two byte characters
119      3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
120          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
121          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
122      // Four byte characters
123      FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
124
125  /**
126   * Tests that round tripping of a sample of four byte permutations work.
127   * All permutations are prohibitively expensive to test for automated runs.
128   * This method tests specific four-byte cases.
129   */
130  public void testIsWellFormed_4BytesSamples() {
131    // Valid 4 byte.
132    assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
133    // Bad trailing bytes
134    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
135    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
136    // Special cases for byte2
137    assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
138    assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
139  }
140
141  /** Tests some hard-coded test cases. */
142  public void testSomeSequences() {
143    // Empty
144    assertWellFormed();
145    // One-byte characters, including control characters
146    assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
147    // Two-byte characters
148    assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
149    // Three-byte characters
150    assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
151    // Four-byte characters
152    // "\u024B62\u024B62"
153    assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
154    // Mixed string
155    // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
156    assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30,
157        0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63,
158        0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
159    // Not a valid string
160    assertNotWellFormed(-1, 0, -1, 0);
161  }
162
163  public void testShardsHaveExpectedRoundTrippables() {
164    // A sanity check.
165    long actual = 0;
166    for (long expected : generateFourByteShardsExpectedRunnables()) {
167      actual += expected;
168    }
169    assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
170  }
171
172  private String newString(char... chars) {
173    return new String(chars);
174  }
175
176  private byte[] toByteArray(int... bytes) {
177    byte[] realBytes = new byte[bytes.length];
178    for (int i = 0; i < bytes.length; i++) {
179      realBytes[i] = (byte) bytes[i];
180    }
181    return realBytes;
182  }
183
184  private void assertWellFormed(int... bytes) {
185    assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
186  }
187
188  private void assertNotWellFormed(int... bytes) {
189    assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
190  }
191
192  private static long[] generateFourByteShardsExpectedRunnables() {
193    long[] expected = new long[128];
194    // 0-63 are all 5300224
195    for (int i = 0; i <= 63; i++) {
196      expected[i] = 5300224;
197    }
198    // 97-111 are all 2342912
199    for (int i = 97; i <= 111; i++) {
200     expected[i] = 2342912;
201    }
202    // 113-117 are all 1048576
203    for (int i = 113; i <= 117; i++) {
204      expected[i] = 1048576;
205    }
206    // One offs
207    expected[112] = 786432;
208    expected[118] = 786432;
209    expected[119] = 1048576;
210    expected[120] = 458752;
211    expected[121] = 524288;
212    expected[122] = 65536;
213    // Anything not assigned was the default 0.
214    return expected;
215  }
216}
217
218