1/*
2 * Copyright (C) 2013 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.common.base;
18
19import com.google.common.annotations.GwtCompatible;
20import com.google.common.annotations.GwtIncompatible;
21
22import junit.framework.TestCase;
23
24import java.io.UnsupportedEncodingException;
25import java.util.Arrays;
26import java.util.HashMap;
27import java.util.Random;
28
29/**
30 * Unit tests for {@link Utf8}.
31 *
32 * @author Jon Perlow
33 * @author Martin Buchholz
34 * @author Clément Roux
35 */
36@GwtCompatible(emulated = true)
37public class Utf8Test extends TestCase {
38  public void testEncodedLength_validStrings() {
39    assertEquals(0, Utf8.encodedLength(""));
40    assertEquals(11, Utf8.encodedLength("Hello world"));
41    assertEquals(8, Utf8.encodedLength("Résumé"));
42    assertEquals(461, Utf8.encodedLength("威廉·莎士比亞(William Shakespeare,"
43        + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
44        + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
45        + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
46        + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
47        + "哈都拕人翻譯做好多話。"));
48    // A surrogate pair
49    assertEquals(4, Utf8.encodedLength(
50        newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE)));
51  }
52
53  @GwtIncompatible("StringBuilder.appendCodePoint()")
54  public void testEncodedLength_validStrings2() {
55    HashMap<Integer, Integer> utf8Lengths = new HashMap<Integer, Integer>();
56    utf8Lengths.put(0x00, 1);
57    utf8Lengths.put(0x7f, 1);
58    utf8Lengths.put(0x80, 2);
59    utf8Lengths.put(0x7ff, 2);
60    utf8Lengths.put(0x800, 3);
61    utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
62    utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT, 4);
63    utf8Lengths.put(Character.MAX_CODE_POINT, 4);
64
65    Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[]{});
66    StringBuilder sb = new StringBuilder();
67    Random rnd = new Random();
68    for (int trial = 0; trial < 100; trial++) {
69      sb.setLength(0);
70      int utf8Length = 0;
71      for (int i = 0; i < 6; i++) {
72        Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
73        sb.appendCodePoint(randomCodePoint);
74        utf8Length += utf8Lengths.get(randomCodePoint);
75        if (utf8Length != Utf8.encodedLength(sb)) {
76          StringBuilder repro = new StringBuilder();
77          for (int j = 0; j < sb.length(); j++) {
78            repro.append(" " + (int) sb.charAt(j));  // GWT compatible
79          }
80          assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
81        }
82      }
83    }
84  }
85
86  public void testEncodedLength_invalidStrings() {
87    testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0);
88    testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6);
89    testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0);
90    testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6);
91    testEncodedLengthFails(
92        newString(
93            Character.MIN_HIGH_SURROGATE,
94            Character.MIN_HIGH_SURROGATE), 0);
95  }
96
97  private static void testEncodedLengthFails(String invalidString,
98      int invalidCodePointIndex) {
99    try {
100      Utf8.encodedLength(invalidString);
101      fail();
102    } catch (IllegalArgumentException expected) {
103      assertEquals("Unpaired surrogate at index " + invalidCodePointIndex,
104          expected.getMessage());
105    }
106  }
107
108  // 128 - [chars 0x0000 to 0x007f]
109  private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
110      0x007f - 0x0000 + 1;
111
112  // 128
113  private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
114      ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
115
116  // 1920 [chars 0x0080 to 0x07FF]
117  private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS =
118      0x07FF - 0x0080 + 1;
119
120  // 18,304
121  private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
122      // Both bytes are one byte characters
123      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
124      // The possible number of two byte characters
125      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
126
127  // 2048
128  private static final long THREE_BYTE_SURROGATES = 2 * 1024;
129
130  // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
131  private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
132      0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
133
134  // 2,650,112
135  private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
136      // All one byte characters
137      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
138      // One two byte character and a one byte character
139      2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
140          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
141       // Three byte characters
142      THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
143
144  // 1,048,576 [chars 0x10000L to 0x10FFFF]
145  private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
146      0x10FFFF - 0x10000L + 1;
147
148  // 289,571,839
149  private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
150      // All one byte characters
151      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
152      // One and three byte characters
153      2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
154          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
155      // Two two byte characters
156      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
157      // Permutations of one and two byte characters
158      3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
159          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
160          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
161      // Four byte characters
162      FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
163
164  /** Tests that round tripping of all two byte permutations work. */
165  @GwtIncompatible("java.nio.charset.Charset")
166  public void testIsWellFormed_1Byte() {
167    testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
168  }
169
170  /** Tests that round tripping of all two byte permutations work. */
171  @GwtIncompatible("java.nio.charset.Charset")
172  public void testIsWellFormed_2Bytes() {
173    testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
174  }
175
176  /** Tests that round tripping of all three byte permutations work. */
177  @GwtIncompatible("java.nio.charset.Charset")
178  public void testIsWellFormed_3Bytes() {
179    testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
180  }
181
182  /**
183   * Tests that round tripping of a sample of four byte permutations work.
184   * All permutations are prohibitively expensive to test for automated runs.
185   * This method tests specific four-byte cases.
186   */
187  public void testIsWellFormed_4BytesSamples() {
188    // Valid 4 byte.
189    assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
190    // Bad trailing bytes
191    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
192    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
193    // Special cases for byte2
194    assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
195    assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
196  }
197
198  /** Tests some hard-coded test cases. */
199  public void testSomeSequences() {
200    // Empty
201    assertWellFormed();
202    // One-byte characters, including control characters
203    assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
204    // Two-byte characters
205    assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
206    // Three-byte characters
207    assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
208    // Four-byte characters
209    // "\u024B62\u024B62"
210    assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
211    // Mixed string
212    // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
213    assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30,
214        0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63,
215        0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
216    // Not a valid string
217    assertNotWellFormed(-1, 0, -1, 0);
218  }
219
220  public void testShardsHaveExpectedRoundTrippables() {
221    // A sanity check.
222    long actual = 0;
223    for (long expected : generateFourByteShardsExpectedRunnables()) {
224      actual += expected;
225    }
226    assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
227  }
228
229  private String newString(char... chars) {
230    return new String(chars);
231  }
232
233  private byte[] toByteArray(int... bytes) {
234    byte[] realBytes = new byte[bytes.length];
235    for (int i = 0; i < bytes.length; i++) {
236      realBytes[i] = (byte) bytes[i];
237    }
238    return realBytes;
239  }
240
241  private void assertWellFormed(int... bytes) {
242    assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
243  }
244
245  private void assertNotWellFormed(int... bytes) {
246    assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
247  }
248
249  private static long[] generateFourByteShardsExpectedRunnables() {
250    long[] expected = new long[128];
251    // 0-63 are all 5300224
252    for (int i = 0; i <= 63; i++) {
253      expected[i] = 5300224;
254    }
255    // 97-111 are all 2342912
256    for (int i = 97; i <= 111; i++) {
257     expected[i] = 2342912;
258    }
259    // 113-117 are all 1048576
260    for (int i = 113; i <= 117; i++) {
261      expected[i] = 1048576;
262    }
263    // One offs
264    expected[112] = 786432;
265    expected[118] = 786432;
266    expected[119] = 1048576;
267    expected[120] = 458752;
268    expected[121] = 524288;
269    expected[122] = 65536;
270    // Anything not assigned was the default 0.
271    return expected;
272  }
273
274  /**
275   * Helper to run the loop to test all the permutations for the number of bytes
276   * specified.
277   *
278   * @param numBytes the number of bytes in the byte array
279   * @param expectedCount the expected number of roundtrippable permutations
280   */
281  @GwtIncompatible("java.nio.charset.Charset")
282  private static void testBytes(int numBytes, long expectedCount) {
283    testBytes(numBytes, expectedCount, 0, -1);
284  }
285
286  /**
287   * Helper to run the loop to test all the permutations for the number of bytes
288   * specified. This overload is useful for debugging to get the loop to start
289   * at a certain character.
290   *
291   * @param numBytes the number of bytes in the byte array
292   * @param expectedCount the expected number of roundtrippable permutations
293   * @param start the starting bytes encoded as a long as big-endian
294   * @param lim the limit of bytes to process encoded as a long as big-endian,
295   *     or -1 to mean the max limit for numBytes
296   */
297  @GwtIncompatible("java.nio.charset.Charset")
298  private static void testBytes(int numBytes, long expectedCount, long start,
299      long lim) {
300    byte[] bytes = new byte[numBytes];
301    if (lim == -1) {
302      lim = 1L << (numBytes * 8);
303    }
304    long countRoundTripped = 0;
305    for (long byteChar = start; byteChar < lim; byteChar++) {
306      long tmpByteChar = byteChar;
307      for (int i = 0; i < numBytes; i++) {
308        bytes[bytes.length - i - 1] = (byte) tmpByteChar;
309        tmpByteChar = tmpByteChar >> 8;
310      }
311      boolean isRoundTrippable = Utf8.isWellFormed(bytes);
312      assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
313      boolean bytesEqual;
314      try {
315        String s = new String(bytes, Charsets.UTF_8.name());
316        byte[] bytesReencoded = s.getBytes(Charsets.UTF_8.name());
317        bytesEqual = Arrays.equals(bytes, bytesReencoded);
318      } catch (UnsupportedEncodingException e) {
319        throw new AssertionError(e);
320      }
321
322      if (bytesEqual != isRoundTrippable) {
323        fail();
324      }
325      if (isRoundTrippable) {
326        countRoundTripped++;
327      }
328    }
329    assertEquals(expectedCount, countRoundTripped);
330  }
331}
332