1 /* 2 * Copyright (C) 2011 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package examples; 18 19 import com.google.caliper.BeforeExperiment; 20 import com.google.caliper.Benchmark; 21 import com.google.caliper.Param; 22 23 import java.nio.charset.Charset; 24 import java.util.Random; 25 26 /** 27 * Benchmark for operations with the UTF-8 charset. 28 */ 29 public class Utf8Benchmark { 30 31 static final Charset UTF_8 = Charset.forName("UTF-8"); 32 33 /** 34 * The maximum code point used in generated text. Different values 35 * provide reasonable models of different real-world human text. 36 */ 37 static class MaxCodePoint { 38 final int value; 39 40 /** 41 * Convert the input string to a code point. Accepts regular 42 * decimal numerals, hex strings, and some symbolic names 43 * meaningful to humans. 44 */ decode(String userFriendly)45 private static int decode(String userFriendly) { 46 try { 47 return Integer.decode(userFriendly); 48 } catch (NumberFormatException ignored) { 49 if (userFriendly.matches("(?i)(?:American|English|ASCII)")) { 50 // 1-byte UTF-8 sequences - "American" ASCII text 51 return 0x80; 52 } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) { 53 // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte 54 // sequences - "Western European" text 55 return 0x90; 56 } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) { 57 // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time. 58 return 0x100; 59 } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) { 60 // Mostly 2-byte UTF-8 sequences - "European" text 61 return 0x800; 62 } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) { 63 // Mostly 3-byte UTF-8 sequences - "Asian" text 64 return Character.MIN_SUPPLEMENTARY_CODE_POINT; 65 } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) { 66 // Mostly 4-byte UTF-8 sequences - "rare exotic" text 67 return Character.MAX_CODE_POINT; 68 } else { 69 throw new IllegalArgumentException("Can't decode codepoint " + userFriendly); 70 } 71 } 72 } 73 valueOf(String userFriendly)74 public static MaxCodePoint valueOf(String userFriendly) { 75 return new MaxCodePoint(userFriendly); 76 } 77 MaxCodePoint(String userFriendly)78 private MaxCodePoint(String userFriendly) { 79 value = decode(userFriendly); 80 } 81 } 82 83 /** 84 * The default values of maxCodePoint below provide pretty good 85 * performance models of different kinds of common human text. 86 * @see MaxCodePoint#decode 87 */ 88 @Param({"0x80", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint; 89 90 static final int STRING_COUNT = 1 << 7; 91 92 @Param({"65536"}) int charCount; 93 private String[] strings; 94 95 /** 96 * Computes arrays of valid unicode Strings. 97 */ setUp()98 @BeforeExperiment void setUp() { 99 final long seed = 99; 100 final Random rnd = new Random(seed); 101 strings = new String[STRING_COUNT]; 102 for (int i = 0; i < STRING_COUNT; i++) { 103 StringBuilder sb = new StringBuilder(); 104 for (int j = 0; j < charCount; j++) { 105 int codePoint; 106 // discard illegal surrogate "codepoints" 107 do { 108 codePoint = rnd.nextInt(maxCodePoint.value); 109 } while (isSurrogate(codePoint)); 110 sb.appendCodePoint(codePoint); 111 } 112 strings[i] = sb.toString(); 113 } 114 // The reps will continue until the non-determinism detector is pacified! 115 getBytes(100); 116 } 117 118 /** 119 * Benchmarks {@link String#getBytes} on valid strings containing 120 * pseudo-randomly-generated codePoints less than {@code 121 * maxCodePoint}. A constant seed is used, so separate runs perform 122 * identical computations. 123 */ getBytes(int reps)124 @Benchmark void getBytes(int reps) { 125 final String[] strings = this.strings; 126 final int mask = STRING_COUNT - 1; 127 for (int i = 0; i < reps; i++) { 128 String string = strings[i & mask]; 129 byte[] bytes = string.getBytes(UTF_8); 130 if (bytes[0] == 86 && bytes[bytes.length - 1] == 99) { 131 throw new Error("Unlikely! We're just defeating the optimizer!"); 132 } 133 } 134 } 135 136 /** Character.isSurrogate was added in Java SE 7. */ isSurrogate(int c)137 private boolean isSurrogate(int c) { 138 return (Character.MIN_HIGH_SURROGATE <= c && 139 c <= Character.MAX_LOW_SURROGATE); 140 } 141 } 142