/*
 * Copyright (C) 2011 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
160888a09821a98ac0680fad765217302858e70fa4Paul Duffin
170888a09821a98ac0680fad765217302858e70fa4Paul Duffinpackage com.google.common.base;
180888a09821a98ac0680fad765217302858e70fa4Paul Duffin
190888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport com.google.caliper.BeforeExperiment;
200888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport com.google.caliper.Benchmark;
210888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport com.google.caliper.Param;
220888a09821a98ac0680fad765217302858e70fa4Paul Duffin
230888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport java.util.Random;
240888a09821a98ac0680fad765217302858e70fa4Paul Duffin
250888a09821a98ac0680fad765217302858e70fa4Paul Duffin/**
260888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Benchmark for the {@link Utf8} class.
270888a09821a98ac0680fad765217302858e70fa4Paul Duffin *
280888a09821a98ac0680fad765217302858e70fa4Paul Duffin *
290888a09821a98ac0680fad765217302858e70fa4Paul Duffin * @author Martin Buchholz
300888a09821a98ac0680fad765217302858e70fa4Paul Duffin */
310888a09821a98ac0680fad765217302858e70fa4Paul Duffinpublic class Utf8Benchmark {
320888a09821a98ac0680fad765217302858e70fa4Paul Duffin
330888a09821a98ac0680fad765217302858e70fa4Paul Duffin  static class MaxCodePoint {
340888a09821a98ac0680fad765217302858e70fa4Paul Duffin    final int value;
350888a09821a98ac0680fad765217302858e70fa4Paul Duffin
360888a09821a98ac0680fad765217302858e70fa4Paul Duffin    /**
370888a09821a98ac0680fad765217302858e70fa4Paul Duffin     * Convert the input string to a code point.  Accepts regular
380888a09821a98ac0680fad765217302858e70fa4Paul Duffin     * decimal numerals, hex strings, and some symbolic names
390888a09821a98ac0680fad765217302858e70fa4Paul Duffin     * meaningful to humans.
400888a09821a98ac0680fad765217302858e70fa4Paul Duffin     */
410888a09821a98ac0680fad765217302858e70fa4Paul Duffin    private static int decode(String userFriendly) {
420888a09821a98ac0680fad765217302858e70fa4Paul Duffin      try {
430888a09821a98ac0680fad765217302858e70fa4Paul Duffin        return Integer.decode(userFriendly);
440888a09821a98ac0680fad765217302858e70fa4Paul Duffin      } catch (NumberFormatException ignored) {
450888a09821a98ac0680fad765217302858e70fa4Paul Duffin        if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
460888a09821a98ac0680fad765217302858e70fa4Paul Duffin          // 1-byte UTF-8 sequences - "American" ASCII text
470888a09821a98ac0680fad765217302858e70fa4Paul Duffin          return 0x80;
480888a09821a98ac0680fad765217302858e70fa4Paul Duffin        } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) {
490888a09821a98ac0680fad765217302858e70fa4Paul Duffin          // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
500888a09821a98ac0680fad765217302858e70fa4Paul Duffin          // sequences - "Western European" text
510888a09821a98ac0680fad765217302858e70fa4Paul Duffin          return 0x90;
520888a09821a98ac0680fad765217302858e70fa4Paul Duffin        } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) {
530888a09821a98ac0680fad765217302858e70fa4Paul Duffin          // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time.
540888a09821a98ac0680fad765217302858e70fa4Paul Duffin          return 0x100;
550888a09821a98ac0680fad765217302858e70fa4Paul Duffin        } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
560888a09821a98ac0680fad765217302858e70fa4Paul Duffin          // Mostly 2-byte UTF-8 sequences - "European" text
570888a09821a98ac0680fad765217302858e70fa4Paul Duffin          return 0x800;
580888a09821a98ac0680fad765217302858e70fa4Paul Duffin        } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
590888a09821a98ac0680fad765217302858e70fa4Paul Duffin          // Mostly 3-byte UTF-8 sequences - "Asian" text
600888a09821a98ac0680fad765217302858e70fa4Paul Duffin          return Character.MIN_SUPPLEMENTARY_CODE_POINT;
610888a09821a98ac0680fad765217302858e70fa4Paul Duffin        } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
620888a09821a98ac0680fad765217302858e70fa4Paul Duffin          // Mostly 4-byte UTF-8 sequences - "rare exotic" text
630888a09821a98ac0680fad765217302858e70fa4Paul Duffin          return Character.MAX_CODE_POINT;
640888a09821a98ac0680fad765217302858e70fa4Paul Duffin        } else {
650888a09821a98ac0680fad765217302858e70fa4Paul Duffin          throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
660888a09821a98ac0680fad765217302858e70fa4Paul Duffin        }
670888a09821a98ac0680fad765217302858e70fa4Paul Duffin      }
680888a09821a98ac0680fad765217302858e70fa4Paul Duffin    }
690888a09821a98ac0680fad765217302858e70fa4Paul Duffin
700888a09821a98ac0680fad765217302858e70fa4Paul Duffin    public static MaxCodePoint valueOf(String userFriendly) {
710888a09821a98ac0680fad765217302858e70fa4Paul Duffin      return new MaxCodePoint(userFriendly);
720888a09821a98ac0680fad765217302858e70fa4Paul Duffin    }
730888a09821a98ac0680fad765217302858e70fa4Paul Duffin
740888a09821a98ac0680fad765217302858e70fa4Paul Duffin    public MaxCodePoint(String userFriendly) {
750888a09821a98ac0680fad765217302858e70fa4Paul Duffin      value = decode(userFriendly);
760888a09821a98ac0680fad765217302858e70fa4Paul Duffin    }
770888a09821a98ac0680fad765217302858e70fa4Paul Duffin  }
780888a09821a98ac0680fad765217302858e70fa4Paul Duffin
790888a09821a98ac0680fad765217302858e70fa4Paul Duffin  /**
800888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * The default values of maxCodePoint below provide pretty good
810888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * performance models of different kinds of common human text.
820888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * @see MaxCodePoint#decode
830888a09821a98ac0680fad765217302858e70fa4Paul Duffin   */
840888a09821a98ac0680fad765217302858e70fa4Paul Duffin  @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint;
850888a09821a98ac0680fad765217302858e70fa4Paul Duffin
860888a09821a98ac0680fad765217302858e70fa4Paul Duffin  @Param({"100"}) int stringCount;
870888a09821a98ac0680fad765217302858e70fa4Paul Duffin  @Param({"16384"}) int charCount;
880888a09821a98ac0680fad765217302858e70fa4Paul Duffin  private CharSequence[] seqs;  // actually, all StringBuilders
890888a09821a98ac0680fad765217302858e70fa4Paul Duffin  private String[] strings;
900888a09821a98ac0680fad765217302858e70fa4Paul Duffin  private byte[][] byteArrays;
910888a09821a98ac0680fad765217302858e70fa4Paul Duffin
920888a09821a98ac0680fad765217302858e70fa4Paul Duffin  /**
930888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * Compute arrays of valid unicode text, and store it in 3 forms:
940888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * byte arrays, Strings, and StringBuilders (in a CharSequence[] to
950888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * make it a little harder for the JVM).
960888a09821a98ac0680fad765217302858e70fa4Paul Duffin   */
970888a09821a98ac0680fad765217302858e70fa4Paul Duffin  @BeforeExperiment void setUp() {
980888a09821a98ac0680fad765217302858e70fa4Paul Duffin    final long seed = 99;
990888a09821a98ac0680fad765217302858e70fa4Paul Duffin    final Random rnd = new Random(seed);
1000888a09821a98ac0680fad765217302858e70fa4Paul Duffin    seqs = new CharSequence[stringCount];
1010888a09821a98ac0680fad765217302858e70fa4Paul Duffin    strings = new String[stringCount];
1020888a09821a98ac0680fad765217302858e70fa4Paul Duffin    byteArrays = new byte[stringCount][];
1030888a09821a98ac0680fad765217302858e70fa4Paul Duffin    for (int i = 0; i < stringCount; i++) {
1040888a09821a98ac0680fad765217302858e70fa4Paul Duffin      StringBuilder sb = new StringBuilder();
1050888a09821a98ac0680fad765217302858e70fa4Paul Duffin      for (int j = 0; j < charCount; j++) {
1060888a09821a98ac0680fad765217302858e70fa4Paul Duffin        int codePoint;
1070888a09821a98ac0680fad765217302858e70fa4Paul Duffin        // discard illegal surrogate "codepoints"
1080888a09821a98ac0680fad765217302858e70fa4Paul Duffin        do {
1090888a09821a98ac0680fad765217302858e70fa4Paul Duffin          codePoint = rnd.nextInt(maxCodePoint.value);
1100888a09821a98ac0680fad765217302858e70fa4Paul Duffin        } while (isSurrogate(codePoint));
1110888a09821a98ac0680fad765217302858e70fa4Paul Duffin        sb.appendCodePoint(codePoint);
1120888a09821a98ac0680fad765217302858e70fa4Paul Duffin      }
1130888a09821a98ac0680fad765217302858e70fa4Paul Duffin      seqs[i] = sb;
1140888a09821a98ac0680fad765217302858e70fa4Paul Duffin      strings[i] = sb.toString();
1150888a09821a98ac0680fad765217302858e70fa4Paul Duffin      byteArrays[i] = strings[i].getBytes(Charsets.UTF_8);
1160888a09821a98ac0680fad765217302858e70fa4Paul Duffin    }
1170888a09821a98ac0680fad765217302858e70fa4Paul Duffin  }
1180888a09821a98ac0680fad765217302858e70fa4Paul Duffin
1190888a09821a98ac0680fad765217302858e70fa4Paul Duffin  /**
1200888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * Benchmarks {@link Utf8#isWellFormed} on valid byte arrays
1210888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * containing pseudo-randomly-generated codePoints less than {@code
1220888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * maxCodePoint}.  A constant seed is used, so separate runs perform
1230888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * identical computations.
1240888a09821a98ac0680fad765217302858e70fa4Paul Duffin   */
1250888a09821a98ac0680fad765217302858e70fa4Paul Duffin  @Benchmark void isWellFormed(int reps) {
1260888a09821a98ac0680fad765217302858e70fa4Paul Duffin    for (int i = 0; i < reps; i++) {
1270888a09821a98ac0680fad765217302858e70fa4Paul Duffin      for (byte[] byteArray : byteArrays) {
1280888a09821a98ac0680fad765217302858e70fa4Paul Duffin        if (!Utf8.isWellFormed(byteArray)) {
1290888a09821a98ac0680fad765217302858e70fa4Paul Duffin          throw new Error("unexpected invalid UTF-8");
1300888a09821a98ac0680fad765217302858e70fa4Paul Duffin        }
1310888a09821a98ac0680fad765217302858e70fa4Paul Duffin      }
1320888a09821a98ac0680fad765217302858e70fa4Paul Duffin    }
1330888a09821a98ac0680fad765217302858e70fa4Paul Duffin  }
1340888a09821a98ac0680fad765217302858e70fa4Paul Duffin
1350888a09821a98ac0680fad765217302858e70fa4Paul Duffin  /**
1360888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * Benchmarks {@link Utf8#length} on valid strings containing
1370888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * pseudo-randomly-generated codePoints less than {@code
1380888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * maxCodePoint}.  A constant seed is used, so separate runs perform
1390888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * identical computations.
1400888a09821a98ac0680fad765217302858e70fa4Paul Duffin   */
1410888a09821a98ac0680fad765217302858e70fa4Paul Duffin  @Benchmark void lengthOfString(int reps) {
1420888a09821a98ac0680fad765217302858e70fa4Paul Duffin    for (int i = 0; i < reps; i++) {
1430888a09821a98ac0680fad765217302858e70fa4Paul Duffin      for (String string : strings) {
1440888a09821a98ac0680fad765217302858e70fa4Paul Duffin        if (Utf8.encodedLength(string) == 1237482374) {
1450888a09821a98ac0680fad765217302858e70fa4Paul Duffin          throw new Error("Unlikely! We're just defeating the optimizer!");
1460888a09821a98ac0680fad765217302858e70fa4Paul Duffin        }
1470888a09821a98ac0680fad765217302858e70fa4Paul Duffin      }
1480888a09821a98ac0680fad765217302858e70fa4Paul Duffin    }
1490888a09821a98ac0680fad765217302858e70fa4Paul Duffin  }
1500888a09821a98ac0680fad765217302858e70fa4Paul Duffin
1510888a09821a98ac0680fad765217302858e70fa4Paul Duffin  /**
1520888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * Benchmarks {@link Utf8#length} on valid StringBuilders containing
1530888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * pseudo-randomly-generated codePoints less than {@code
1540888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * maxCodePoint}.  A constant seed is used, so separate runs perform
1550888a09821a98ac0680fad765217302858e70fa4Paul Duffin   * identical computations.
1560888a09821a98ac0680fad765217302858e70fa4Paul Duffin   */
1570888a09821a98ac0680fad765217302858e70fa4Paul Duffin  @Benchmark void lengthOfStringBuilder(int reps) {
1580888a09821a98ac0680fad765217302858e70fa4Paul Duffin    for (int i = 0; i < reps; i++) {
1590888a09821a98ac0680fad765217302858e70fa4Paul Duffin      for (CharSequence seq : seqs) {
1600888a09821a98ac0680fad765217302858e70fa4Paul Duffin        if (Utf8.encodedLength(seq) == 1237482374) {
1610888a09821a98ac0680fad765217302858e70fa4Paul Duffin          throw new Error("Unlikely! We're just defeating the optimizer!");
1620888a09821a98ac0680fad765217302858e70fa4Paul Duffin        }
1630888a09821a98ac0680fad765217302858e70fa4Paul Duffin      }
1640888a09821a98ac0680fad765217302858e70fa4Paul Duffin    }
1650888a09821a98ac0680fad765217302858e70fa4Paul Duffin  }
1660888a09821a98ac0680fad765217302858e70fa4Paul Duffin
1670888a09821a98ac0680fad765217302858e70fa4Paul Duffin  /** Character.isSurrogate was added in Java SE 7. */
1680888a09821a98ac0680fad765217302858e70fa4Paul Duffin  private boolean isSurrogate(int c) {
1690888a09821a98ac0680fad765217302858e70fa4Paul Duffin    return (Character.MIN_HIGH_SURROGATE <= c &&
1700888a09821a98ac0680fad765217302858e70fa4Paul Duffin            c <= Character.MAX_LOW_SURROGATE);
1710888a09821a98ac0680fad765217302858e70fa4Paul Duffin  }
1720888a09821a98ac0680fad765217302858e70fa4Paul Duffin}
173