1/*
2 * Copyright (C) 2011 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.common.base;
18
19import com.google.caliper.BeforeExperiment;
20import com.google.caliper.Benchmark;
21import com.google.caliper.Param;
22
23import java.util.Random;
24
25/**
26 * Benchmark for the {@link Utf8} class.
27 *
28 *
29 * @author Martin Buchholz
30 */
31public class Utf8Benchmark {
32
33  static class MaxCodePoint {
34    final int value;
35
36    /**
37     * Convert the input string to a code point.  Accepts regular
38     * decimal numerals, hex strings, and some symbolic names
39     * meaningful to humans.
40     */
41    private static int decode(String userFriendly) {
42      try {
43        return Integer.decode(userFriendly);
44      } catch (NumberFormatException ignored) {
45        if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
46          // 1-byte UTF-8 sequences - "American" ASCII text
47          return 0x80;
48        } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) {
49          // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
50          // sequences - "Western European" text
51          return 0x90;
52        } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) {
53          // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time.
54          return 0x100;
55        } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
56          // Mostly 2-byte UTF-8 sequences - "European" text
57          return 0x800;
58        } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
59          // Mostly 3-byte UTF-8 sequences - "Asian" text
60          return Character.MIN_SUPPLEMENTARY_CODE_POINT;
61        } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
62          // Mostly 4-byte UTF-8 sequences - "rare exotic" text
63          return Character.MAX_CODE_POINT;
64        } else {
65          throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
66        }
67      }
68    }
69
70    public static MaxCodePoint valueOf(String userFriendly) {
71      return new MaxCodePoint(userFriendly);
72    }
73
74    public MaxCodePoint(String userFriendly) {
75      value = decode(userFriendly);
76    }
77  }
78
79  /**
80   * The default values of maxCodePoint below provide pretty good
81   * performance models of different kinds of common human text.
82   * @see MaxCodePoint#decode
83   */
84  @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint;
85
86  @Param({"100"}) int stringCount;
87  @Param({"16384"}) int charCount;
88  private CharSequence[] seqs;  // actually, all StringBuilders
89  private String[] strings;
90  private byte[][] byteArrays;
91
92  /**
93   * Compute arrays of valid unicode text, and store it in 3 forms:
94   * byte arrays, Strings, and StringBuilders (in a CharSequence[] to
95   * make it a little harder for the JVM).
96   */
97  @BeforeExperiment void setUp() {
98    final long seed = 99;
99    final Random rnd = new Random(seed);
100    seqs = new CharSequence[stringCount];
101    strings = new String[stringCount];
102    byteArrays = new byte[stringCount][];
103    for (int i = 0; i < stringCount; i++) {
104      StringBuilder sb = new StringBuilder();
105      for (int j = 0; j < charCount; j++) {
106        int codePoint;
107        // discard illegal surrogate "codepoints"
108        do {
109          codePoint = rnd.nextInt(maxCodePoint.value);
110        } while (isSurrogate(codePoint));
111        sb.appendCodePoint(codePoint);
112      }
113      seqs[i] = sb;
114      strings[i] = sb.toString();
115      byteArrays[i] = strings[i].getBytes(Charsets.UTF_8);
116    }
117  }
118
119  /**
120   * Benchmarks {@link Utf8#isWellFormed} on valid byte arrays
121   * containing pseudo-randomly-generated codePoints less than {@code
122   * maxCodePoint}.  A constant seed is used, so separate runs perform
123   * identical computations.
124   */
125  @Benchmark void isWellFormed(int reps) {
126    for (int i = 0; i < reps; i++) {
127      for (byte[] byteArray : byteArrays) {
128        if (!Utf8.isWellFormed(byteArray)) {
129          throw new Error("unexpected invalid UTF-8");
130        }
131      }
132    }
133  }
134
135  /**
136   * Benchmarks {@link Utf8#length} on valid strings containing
137   * pseudo-randomly-generated codePoints less than {@code
138   * maxCodePoint}.  A constant seed is used, so separate runs perform
139   * identical computations.
140   */
141  @Benchmark void lengthOfString(int reps) {
142    for (int i = 0; i < reps; i++) {
143      for (String string : strings) {
144        if (Utf8.encodedLength(string) == 1237482374) {
145          throw new Error("Unlikely! We're just defeating the optimizer!");
146        }
147      }
148    }
149  }
150
151  /**
152   * Benchmarks {@link Utf8#length} on valid StringBuilders containing
153   * pseudo-randomly-generated codePoints less than {@code
154   * maxCodePoint}.  A constant seed is used, so separate runs perform
155   * identical computations.
156   */
157  @Benchmark void lengthOfStringBuilder(int reps) {
158    for (int i = 0; i < reps; i++) {
159      for (CharSequence seq : seqs) {
160        if (Utf8.encodedLength(seq) == 1237482374) {
161          throw new Error("Unlikely! We're just defeating the optimizer!");
162        }
163      }
164    }
165  }
166
167  /** Character.isSurrogate was added in Java SE 7. */
168  private boolean isSurrogate(int c) {
169    return (Character.MIN_HIGH_SURROGATE <= c &&
170            c <= Character.MAX_LOW_SURROGATE);
171  }
172}
173