10888a09821a98ac0680fad765217302858e70fa4Paul Duffin/* 20888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Copyright (C) 2011 The Guava Authors 30888a09821a98ac0680fad765217302858e70fa4Paul Duffin * 40888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Licensed under the Apache License, Version 2.0 (the "License"); 50888a09821a98ac0680fad765217302858e70fa4Paul Duffin * you may not use this file except in compliance with the License. 60888a09821a98ac0680fad765217302858e70fa4Paul Duffin * You may obtain a copy of the License at 70888a09821a98ac0680fad765217302858e70fa4Paul Duffin * 80888a09821a98ac0680fad765217302858e70fa4Paul Duffin * http://www.apache.org/licenses/LICENSE-2.0 90888a09821a98ac0680fad765217302858e70fa4Paul Duffin * 100888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Unless required by applicable law or agreed to in writing, software 110888a09821a98ac0680fad765217302858e70fa4Paul Duffin * distributed under the License is distributed on an "AS IS" BASIS, 120888a09821a98ac0680fad765217302858e70fa4Paul Duffin * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 130888a09821a98ac0680fad765217302858e70fa4Paul Duffin * See the License for the specific language governing permissions and 140888a09821a98ac0680fad765217302858e70fa4Paul Duffin * limitations under the License. 150888a09821a98ac0680fad765217302858e70fa4Paul Duffin */ 160888a09821a98ac0680fad765217302858e70fa4Paul Duffin 170888a09821a98ac0680fad765217302858e70fa4Paul Duffinpackage com.google.common.base; 180888a09821a98ac0680fad765217302858e70fa4Paul Duffin 190888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport com.google.caliper.BeforeExperiment; 200888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport com.google.caliper.Benchmark; 210888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport com.google.caliper.Param; 220888a09821a98ac0680fad765217302858e70fa4Paul Duffin 230888a09821a98ac0680fad765217302858e70fa4Paul Duffinimport java.util.Random; 240888a09821a98ac0680fad765217302858e70fa4Paul Duffin 250888a09821a98ac0680fad765217302858e70fa4Paul Duffin/** 260888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Benchmark for the {@link Utf8} class. 270888a09821a98ac0680fad765217302858e70fa4Paul Duffin * 280888a09821a98ac0680fad765217302858e70fa4Paul Duffin * 290888a09821a98ac0680fad765217302858e70fa4Paul Duffin * @author Martin Buchholz 300888a09821a98ac0680fad765217302858e70fa4Paul Duffin */ 310888a09821a98ac0680fad765217302858e70fa4Paul Duffinpublic class Utf8Benchmark { 320888a09821a98ac0680fad765217302858e70fa4Paul Duffin 330888a09821a98ac0680fad765217302858e70fa4Paul Duffin static class MaxCodePoint { 340888a09821a98ac0680fad765217302858e70fa4Paul Duffin final int value; 350888a09821a98ac0680fad765217302858e70fa4Paul Duffin 360888a09821a98ac0680fad765217302858e70fa4Paul Duffin /** 370888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Convert the input string to a code point. Accepts regular 380888a09821a98ac0680fad765217302858e70fa4Paul Duffin * decimal numerals, hex strings, and some symbolic names 390888a09821a98ac0680fad765217302858e70fa4Paul Duffin * meaningful to humans. 400888a09821a98ac0680fad765217302858e70fa4Paul Duffin */ 410888a09821a98ac0680fad765217302858e70fa4Paul Duffin private static int decode(String userFriendly) { 420888a09821a98ac0680fad765217302858e70fa4Paul Duffin try { 430888a09821a98ac0680fad765217302858e70fa4Paul Duffin return Integer.decode(userFriendly); 440888a09821a98ac0680fad765217302858e70fa4Paul Duffin } catch (NumberFormatException ignored) { 450888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (userFriendly.matches("(?i)(?:American|English|ASCII)")) { 460888a09821a98ac0680fad765217302858e70fa4Paul Duffin // 1-byte UTF-8 sequences - "American" ASCII text 470888a09821a98ac0680fad765217302858e70fa4Paul Duffin return 0x80; 480888a09821a98ac0680fad765217302858e70fa4Paul Duffin } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) { 490888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte 500888a09821a98ac0680fad765217302858e70fa4Paul Duffin // sequences - "Western European" text 510888a09821a98ac0680fad765217302858e70fa4Paul Duffin return 0x90; 520888a09821a98ac0680fad765217302858e70fa4Paul Duffin } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) { 530888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time. 540888a09821a98ac0680fad765217302858e70fa4Paul Duffin return 0x100; 550888a09821a98ac0680fad765217302858e70fa4Paul Duffin } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) { 560888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Mostly 2-byte UTF-8 sequences - "European" text 570888a09821a98ac0680fad765217302858e70fa4Paul Duffin return 0x800; 580888a09821a98ac0680fad765217302858e70fa4Paul Duffin } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) { 590888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Mostly 3-byte UTF-8 sequences - "Asian" text 600888a09821a98ac0680fad765217302858e70fa4Paul Duffin return Character.MIN_SUPPLEMENTARY_CODE_POINT; 610888a09821a98ac0680fad765217302858e70fa4Paul Duffin } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) { 620888a09821a98ac0680fad765217302858e70fa4Paul Duffin // Mostly 4-byte UTF-8 sequences - "rare exotic" text 630888a09821a98ac0680fad765217302858e70fa4Paul Duffin return Character.MAX_CODE_POINT; 640888a09821a98ac0680fad765217302858e70fa4Paul Duffin } else { 650888a09821a98ac0680fad765217302858e70fa4Paul Duffin throw new IllegalArgumentException("Can't decode codepoint " + userFriendly); 660888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 670888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 680888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 690888a09821a98ac0680fad765217302858e70fa4Paul Duffin 700888a09821a98ac0680fad765217302858e70fa4Paul Duffin public static MaxCodePoint valueOf(String userFriendly) { 710888a09821a98ac0680fad765217302858e70fa4Paul Duffin return new MaxCodePoint(userFriendly); 720888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 730888a09821a98ac0680fad765217302858e70fa4Paul Duffin 740888a09821a98ac0680fad765217302858e70fa4Paul Duffin public MaxCodePoint(String userFriendly) { 750888a09821a98ac0680fad765217302858e70fa4Paul Duffin value = decode(userFriendly); 760888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 770888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 780888a09821a98ac0680fad765217302858e70fa4Paul Duffin 790888a09821a98ac0680fad765217302858e70fa4Paul Duffin /** 800888a09821a98ac0680fad765217302858e70fa4Paul Duffin * The default values of maxCodePoint below provide pretty good 810888a09821a98ac0680fad765217302858e70fa4Paul Duffin * performance models of different kinds of common human text. 820888a09821a98ac0680fad765217302858e70fa4Paul Duffin * @see MaxCodePoint#decode 830888a09821a98ac0680fad765217302858e70fa4Paul Duffin */ 840888a09821a98ac0680fad765217302858e70fa4Paul Duffin @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint; 850888a09821a98ac0680fad765217302858e70fa4Paul Duffin 860888a09821a98ac0680fad765217302858e70fa4Paul Duffin @Param({"100"}) int stringCount; 870888a09821a98ac0680fad765217302858e70fa4Paul Duffin @Param({"16384"}) int charCount; 880888a09821a98ac0680fad765217302858e70fa4Paul Duffin private CharSequence[] seqs; // actually, all StringBuilders 890888a09821a98ac0680fad765217302858e70fa4Paul Duffin private String[] strings; 900888a09821a98ac0680fad765217302858e70fa4Paul Duffin private byte[][] byteArrays; 910888a09821a98ac0680fad765217302858e70fa4Paul Duffin 920888a09821a98ac0680fad765217302858e70fa4Paul Duffin /** 930888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Compute arrays of valid unicode text, and store it in 3 forms: 940888a09821a98ac0680fad765217302858e70fa4Paul Duffin * byte arrays, Strings, and StringBuilders (in a CharSequence[] to 950888a09821a98ac0680fad765217302858e70fa4Paul Duffin * make it a little harder for the JVM). 960888a09821a98ac0680fad765217302858e70fa4Paul Duffin */ 970888a09821a98ac0680fad765217302858e70fa4Paul Duffin @BeforeExperiment void setUp() { 980888a09821a98ac0680fad765217302858e70fa4Paul Duffin final long seed = 99; 990888a09821a98ac0680fad765217302858e70fa4Paul Duffin final Random rnd = new Random(seed); 1000888a09821a98ac0680fad765217302858e70fa4Paul Duffin seqs = new CharSequence[stringCount]; 1010888a09821a98ac0680fad765217302858e70fa4Paul Duffin strings = new String[stringCount]; 1020888a09821a98ac0680fad765217302858e70fa4Paul Duffin byteArrays = new byte[stringCount][]; 1030888a09821a98ac0680fad765217302858e70fa4Paul Duffin for (int i = 0; i < stringCount; i++) { 1040888a09821a98ac0680fad765217302858e70fa4Paul Duffin StringBuilder sb = new StringBuilder(); 1050888a09821a98ac0680fad765217302858e70fa4Paul Duffin for (int j = 0; j < charCount; j++) { 1060888a09821a98ac0680fad765217302858e70fa4Paul Duffin int codePoint; 1070888a09821a98ac0680fad765217302858e70fa4Paul Duffin // discard illegal surrogate "codepoints" 1080888a09821a98ac0680fad765217302858e70fa4Paul Duffin do { 1090888a09821a98ac0680fad765217302858e70fa4Paul Duffin codePoint = rnd.nextInt(maxCodePoint.value); 1100888a09821a98ac0680fad765217302858e70fa4Paul Duffin } while (isSurrogate(codePoint)); 1110888a09821a98ac0680fad765217302858e70fa4Paul Duffin sb.appendCodePoint(codePoint); 1120888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1130888a09821a98ac0680fad765217302858e70fa4Paul Duffin seqs[i] = sb; 1140888a09821a98ac0680fad765217302858e70fa4Paul Duffin strings[i] = sb.toString(); 1150888a09821a98ac0680fad765217302858e70fa4Paul Duffin byteArrays[i] = strings[i].getBytes(Charsets.UTF_8); 1160888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1170888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1180888a09821a98ac0680fad765217302858e70fa4Paul Duffin 1190888a09821a98ac0680fad765217302858e70fa4Paul Duffin /** 1200888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Benchmarks {@link Utf8#isWellFormed} on valid byte arrays 1210888a09821a98ac0680fad765217302858e70fa4Paul Duffin * containing pseudo-randomly-generated codePoints less than {@code 1220888a09821a98ac0680fad765217302858e70fa4Paul Duffin * maxCodePoint}. A constant seed is used, so separate runs perform 1230888a09821a98ac0680fad765217302858e70fa4Paul Duffin * identical computations. 1240888a09821a98ac0680fad765217302858e70fa4Paul Duffin */ 1250888a09821a98ac0680fad765217302858e70fa4Paul Duffin @Benchmark void isWellFormed(int reps) { 1260888a09821a98ac0680fad765217302858e70fa4Paul Duffin for (int i = 0; i < reps; i++) { 1270888a09821a98ac0680fad765217302858e70fa4Paul Duffin for (byte[] byteArray : byteArrays) { 1280888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (!Utf8.isWellFormed(byteArray)) { 1290888a09821a98ac0680fad765217302858e70fa4Paul Duffin throw new Error("unexpected invalid UTF-8"); 1300888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1310888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1320888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1330888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1340888a09821a98ac0680fad765217302858e70fa4Paul Duffin 1350888a09821a98ac0680fad765217302858e70fa4Paul Duffin /** 1360888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Benchmarks {@link Utf8#length} on valid strings containing 1370888a09821a98ac0680fad765217302858e70fa4Paul Duffin * pseudo-randomly-generated codePoints less than {@code 1380888a09821a98ac0680fad765217302858e70fa4Paul Duffin * maxCodePoint}. A constant seed is used, so separate runs perform 1390888a09821a98ac0680fad765217302858e70fa4Paul Duffin * identical computations. 1400888a09821a98ac0680fad765217302858e70fa4Paul Duffin */ 1410888a09821a98ac0680fad765217302858e70fa4Paul Duffin @Benchmark void lengthOfString(int reps) { 1420888a09821a98ac0680fad765217302858e70fa4Paul Duffin for (int i = 0; i < reps; i++) { 1430888a09821a98ac0680fad765217302858e70fa4Paul Duffin for (String string : strings) { 1440888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (Utf8.encodedLength(string) == 1237482374) { 1450888a09821a98ac0680fad765217302858e70fa4Paul Duffin throw new Error("Unlikely! We're just defeating the optimizer!"); 1460888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1470888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1480888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1490888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1500888a09821a98ac0680fad765217302858e70fa4Paul Duffin 1510888a09821a98ac0680fad765217302858e70fa4Paul Duffin /** 1520888a09821a98ac0680fad765217302858e70fa4Paul Duffin * Benchmarks {@link Utf8#length} on valid StringBuilders containing 1530888a09821a98ac0680fad765217302858e70fa4Paul Duffin * pseudo-randomly-generated codePoints less than {@code 1540888a09821a98ac0680fad765217302858e70fa4Paul Duffin * maxCodePoint}. A constant seed is used, so separate runs perform 1550888a09821a98ac0680fad765217302858e70fa4Paul Duffin * identical computations. 1560888a09821a98ac0680fad765217302858e70fa4Paul Duffin */ 1570888a09821a98ac0680fad765217302858e70fa4Paul Duffin @Benchmark void lengthOfStringBuilder(int reps) { 1580888a09821a98ac0680fad765217302858e70fa4Paul Duffin for (int i = 0; i < reps; i++) { 1590888a09821a98ac0680fad765217302858e70fa4Paul Duffin for (CharSequence seq : seqs) { 1600888a09821a98ac0680fad765217302858e70fa4Paul Duffin if (Utf8.encodedLength(seq) == 1237482374) { 1610888a09821a98ac0680fad765217302858e70fa4Paul Duffin throw new Error("Unlikely! We're just defeating the optimizer!"); 1620888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1630888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1640888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1650888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1660888a09821a98ac0680fad765217302858e70fa4Paul Duffin 1670888a09821a98ac0680fad765217302858e70fa4Paul Duffin /** Character.isSurrogate was added in Java SE 7. */ 1680888a09821a98ac0680fad765217302858e70fa4Paul Duffin private boolean isSurrogate(int c) { 1690888a09821a98ac0680fad765217302858e70fa4Paul Duffin return (Character.MIN_HIGH_SURROGATE <= c && 1700888a09821a98ac0680fad765217302858e70fa4Paul Duffin c <= Character.MAX_LOW_SURROGATE); 1710888a09821a98ac0680fad765217302858e70fa4Paul Duffin } 1720888a09821a98ac0680fad765217302858e70fa4Paul Duffin} 173