Utf8Test.java revision 3ecfa412eddc4b084663f38d562537b86b9734d5
/*
 * Copyright (C) 2013 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17package com.google.common.base;
18
19import com.google.common.annotations.GwtCompatible;
20import com.google.common.annotations.GwtIncompatible;
21
22import junit.framework.TestCase;
23
24import java.util.Arrays;
25import java.util.HashMap;
26import java.util.Random;
27
/**
 * Unit tests for {@link Utf8}.
 *
 * @author Jon Perlow
 * @author Martin Buchholz
 * @author Clément Roux
 */
35@GwtCompatible(emulated = true)
36public class Utf8Test extends TestCase {
37  public void testEncodedLength_validStrings() {
38    assertEquals(0, Utf8.encodedLength(""));
39    assertEquals(11, Utf8.encodedLength("Hello world"));
40    assertEquals(8, Utf8.encodedLength("Résumé"));
41    assertEquals(461, Utf8.encodedLength("威廉·莎士比亞(William Shakespeare,"
42        + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
43        + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
44        + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
45        + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
46        + "哈都拕人翻譯做好多話。"));
47    // A surrogate pair
48    assertEquals(4, Utf8.encodedLength(
49        newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE)));
50  }
51
52  @GwtIncompatible("StringBuilder.appendCodePoint()")
53  public void testEncodedLength_validStrings2() {
54    HashMap<Integer, Integer> utf8Lengths = new HashMap<Integer, Integer>();
55    utf8Lengths.put(0x00, 1);
56    utf8Lengths.put(0x7f, 1);
57    utf8Lengths.put(0x80, 2);
58    utf8Lengths.put(0x7ff, 2);
59    utf8Lengths.put(0x800, 3);
60    utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
61    utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT, 4);
62    utf8Lengths.put(Character.MAX_CODE_POINT, 4);
63
64    Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[]{});
65    StringBuilder sb = new StringBuilder();
66    Random rnd = new Random();
67    for (int trial = 0; trial < 100; trial++) {
68      sb.setLength(0);
69      int utf8Length = 0;
70      for (int i = 0; i < 6; i++) {
71        Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
72        sb.appendCodePoint(randomCodePoint);
73        utf8Length += utf8Lengths.get(randomCodePoint);
74        if (utf8Length != Utf8.encodedLength(sb)) {
75          StringBuilder repro = new StringBuilder();
76          for (int j = 0; j < sb.length(); j++) {
77            repro.append(" " + (int) sb.charAt(j));  // GWT compatible
78          }
79          assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
80        }
81      }
82    }
83  }
84
85  public void testEncodedLength_invalidStrings() {
86    testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0);
87    testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6);
88    testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0);
89    testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6);
90    testEncodedLengthFails(
91        newString(
92            Character.MIN_HIGH_SURROGATE,
93            Character.MIN_HIGH_SURROGATE), 0);
94  }
95
96  private static void testEncodedLengthFails(String invalidString,
97      int invalidCodePointIndex) {
98    try {
99      Utf8.encodedLength(invalidString);
100      fail();
101    } catch (IllegalArgumentException expected) {
102      assertEquals("Unpaired surrogate at index " + invalidCodePointIndex,
103          expected.getMessage());
104    }
105  }
106
107  // 128 - [chars 0x0000 to 0x007f]
108  private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
109      0x007f - 0x0000 + 1;
110
111  // 128
112  private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
113      ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
114
115  // 1920 [chars 0x0080 to 0x07FF]
116  private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS =
117      0x07FF - 0x0080 + 1;
118
119  // 18,304
120  private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
121      // Both bytes are one byte characters
122      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
123      // The possible number of two byte characters
124      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
125
126  // 2048
127  private static final long THREE_BYTE_SURROGATES = 2 * 1024;
128
129  // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
130  private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
131      0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
132
133  // 2,650,112
134  private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
135      // All one byte characters
136      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
137      // One two byte character and a one byte character
138      2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
139          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
140       // Three byte characters
141      THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
142
143  // 1,048,576 [chars 0x10000L to 0x10FFFF]
144  private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
145      0x10FFFF - 0x10000L + 1;
146
147  // 289,571,839
148  private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
149      // All one byte characters
150      (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
151      // One and three byte characters
152      2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
153          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
154      // Two two byte characters
155      TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
156      // Permutations of one and two byte characters
157      3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
158          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
159          ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
160      // Four byte characters
161      FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
162
163  /** Tests that round tripping of all two byte permutations work. */
164  @GwtIncompatible("java.nio.charset.Charset")
165  public void testIsWellFormed_1Byte() {
166    testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
167  }
168
169  /** Tests that round tripping of all two byte permutations work. */
170  @GwtIncompatible("java.nio.charset.Charset")
171  public void testIsWellFormed_2Bytes() {
172    testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
173  }
174
175  /** Tests that round tripping of all three byte permutations work. */
176  @GwtIncompatible("java.nio.charset.Charset")
177  public void testIsWellFormed_3Bytes() {
178    testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
179  }
180
181  /**
182   * Tests that round tripping of a sample of four byte permutations work.
183   * All permutations are prohibitively expensive to test for automated runs.
184   * This method tests specific four-byte cases.
185   */
186  public void testIsWellFormed_4BytesSamples() {
187    // Valid 4 byte.
188    assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
189    // Bad trailing bytes
190    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
191    assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
192    // Special cases for byte2
193    assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
194    assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
195  }
196
197  /** Tests some hard-coded test cases. */
198  public void testSomeSequences() {
199    // Empty
200    assertWellFormed();
201    // One-byte characters, including control characters
202    assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
203    // Two-byte characters
204    assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
205    // Three-byte characters
206    assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
207    // Four-byte characters
208    // "\u024B62\u024B62"
209    assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
210    // Mixed string
211    // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
212    assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30,
213        0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63,
214        0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
215    // Not a valid string
216    assertNotWellFormed(-1, 0, -1, 0);
217  }
218
219  public void testShardsHaveExpectedRoundTrippables() {
220    // A sanity check.
221    long actual = 0;
222    for (long expected : generateFourByteShardsExpectedRunnables()) {
223      actual += expected;
224    }
225    assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
226  }
227
228  private String newString(char... chars) {
229    return new String(chars);
230  }
231
232  private byte[] toByteArray(int... bytes) {
233    byte[] realBytes = new byte[bytes.length];
234    for (int i = 0; i < bytes.length; i++) {
235      realBytes[i] = (byte) bytes[i];
236    }
237    return realBytes;
238  }
239
240  private void assertWellFormed(int... bytes) {
241    assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
242  }
243
244  private void assertNotWellFormed(int... bytes) {
245    assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
246  }
247
248  private static long[] generateFourByteShardsExpectedRunnables() {
249    long[] expected = new long[128];
250    // 0-63 are all 5300224
251    for (int i = 0; i <= 63; i++) {
252      expected[i] = 5300224;
253    }
254    // 97-111 are all 2342912
255    for (int i = 97; i <= 111; i++) {
256     expected[i] = 2342912;
257    }
258    // 113-117 are all 1048576
259    for (int i = 113; i <= 117; i++) {
260      expected[i] = 1048576;
261    }
262    // One offs
263    expected[112] = 786432;
264    expected[118] = 786432;
265    expected[119] = 1048576;
266    expected[120] = 458752;
267    expected[121] = 524288;
268    expected[122] = 65536;
269    // Anything not assigned was the default 0.
270    return expected;
271  }
272
273  /**
274   * Helper to run the loop to test all the permutations for the number of bytes
275   * specified.
276   *
277   * @param numBytes the number of bytes in the byte array
278   * @param expectedCount the expected number of roundtrippable permutations
279   */
280  @GwtIncompatible("java.nio.charset.Charset")
281  private static void testBytes(int numBytes, long expectedCount) {
282    testBytes(numBytes, expectedCount, 0, -1);
283  }
284
285  /**
286   * Helper to run the loop to test all the permutations for the number of bytes
287   * specified. This overload is useful for debugging to get the loop to start
288   * at a certain character.
289   *
290   * @param numBytes the number of bytes in the byte array
291   * @param expectedCount the expected number of roundtrippable permutations
292   * @param start the starting bytes encoded as a long as big-endian
293   * @param lim the limit of bytes to process encoded as a long as big-endian,
294   *     or -1 to mean the max limit for numBytes
295   */
296  @GwtIncompatible("java.nio.charset.Charset")
297  private static void testBytes(int numBytes, long expectedCount, long start,
298      long lim) {
299    byte[] bytes = new byte[numBytes];
300    if (lim == -1) {
301      lim = 1L << (numBytes * 8);
302    }
303    long countRoundTripped = 0;
304    for (long byteChar = start; byteChar < lim; byteChar++) {
305      long tmpByteChar = byteChar;
306      for (int i = 0; i < numBytes; i++) {
307        bytes[bytes.length - i - 1] = (byte) tmpByteChar;
308        tmpByteChar = tmpByteChar >> 8;
309      }
310      boolean isRoundTrippable = Utf8.isWellFormed(bytes);
311      assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
312      String s = new String(bytes, Charsets.UTF_8);
313      byte[] bytesReencoded = s.getBytes(Charsets.UTF_8);
314      boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
315
316      if (bytesEqual != isRoundTrippable) {
317        fail();
318      }
319      if (isRoundTrippable) {
320        countRoundTripped++;
321      }
322    }
323    assertEquals(expectedCount, countRoundTripped);
324  }
325}
326