1 /*
2  * Copyright (C) 2011 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package examples;
18 
19 import com.google.caliper.BeforeExperiment;
20 import com.google.caliper.Benchmark;
21 import com.google.caliper.Param;
22 
23 import java.nio.charset.Charset;
24 import java.util.Random;
25 
26 /**
27  * Benchmark for operations with the UTF-8 charset.
28  */
29 public class Utf8Benchmark {
30 
31   static final Charset UTF_8 = Charset.forName("UTF-8");
32 
33   /**
34    * The maximum code point used in generated text.  Different values
35    * provide reasonable models of different real-world human text.
36    */
37   static class MaxCodePoint {
38     final int value;
39 
40     /**
41      * Convert the input string to a code point.  Accepts regular
42      * decimal numerals, hex strings, and some symbolic names
43      * meaningful to humans.
44      */
decode(String userFriendly)45     private static int decode(String userFriendly) {
46       try {
47         return Integer.decode(userFriendly);
48       } catch (NumberFormatException ignored) {
49         if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
50           // 1-byte UTF-8 sequences - "American" ASCII text
51           return 0x80;
52         } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) {
53           // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
54           // sequences - "Western European" text
55           return 0x90;
56         } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) {
57           // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time.
58           return 0x100;
59         } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
60           // Mostly 2-byte UTF-8 sequences - "European" text
61           return 0x800;
62         } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
63           // Mostly 3-byte UTF-8 sequences - "Asian" text
64           return Character.MIN_SUPPLEMENTARY_CODE_POINT;
65         } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
66           // Mostly 4-byte UTF-8 sequences - "rare exotic" text
67           return Character.MAX_CODE_POINT;
68         } else {
69           throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
70         }
71       }
72     }
73 
valueOf(String userFriendly)74     public static MaxCodePoint valueOf(String userFriendly) {
75       return new MaxCodePoint(userFriendly);
76     }
77 
MaxCodePoint(String userFriendly)78     private MaxCodePoint(String userFriendly) {
79       value = decode(userFriendly);
80     }
81   }
82 
83   /**
84    * The default values of maxCodePoint below provide pretty good
85    * performance models of different kinds of common human text.
86    * @see MaxCodePoint#decode
87    */
88   @Param({"0x80", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint;
89 
90   static final int STRING_COUNT = 1 << 7;
91 
92   @Param({"65536"}) int charCount;
93   private String[] strings;
94 
95   /**
96    * Computes arrays of valid unicode Strings.
97    */
setUp()98   @BeforeExperiment void setUp() {
99     final long seed = 99;
100     final Random rnd = new Random(seed);
101     strings = new String[STRING_COUNT];
102     for (int i = 0; i < STRING_COUNT; i++) {
103       StringBuilder sb = new StringBuilder();
104       for (int j = 0; j < charCount; j++) {
105         int codePoint;
106         // discard illegal surrogate "codepoints"
107         do {
108           codePoint = rnd.nextInt(maxCodePoint.value);
109         } while (isSurrogate(codePoint));
110         sb.appendCodePoint(codePoint);
111       }
112       strings[i] = sb.toString();
113     }
114     // The reps will continue until the non-determinism detector is pacified!
115     getBytes(100);
116   }
117 
118   /**
119    * Benchmarks {@link String#getBytes} on valid strings containing
120    * pseudo-randomly-generated codePoints less than {@code
121    * maxCodePoint}.  A constant seed is used, so separate runs perform
122    * identical computations.
123    */
getBytes(int reps)124   @Benchmark void getBytes(int reps) {
125     final String[] strings = this.strings;
126     final int mask = STRING_COUNT - 1;
127     for (int i = 0; i < reps; i++) {
128       String string = strings[i & mask];
129       byte[] bytes = string.getBytes(UTF_8);
130       if (bytes[0] == 86 && bytes[bytes.length - 1] == 99) {
131         throw new Error("Unlikely! We're just defeating the optimizer!");
132       }
133     }
134   }
135 
136   /** Character.isSurrogate was added in Java SE 7. */
isSurrogate(int c)137   private boolean isSurrogate(int c) {
138     return (Character.MIN_HIGH_SURROGATE <= c &&
139             c <= Character.MAX_LOW_SURROGATE);
140   }
141 }
142