1 /*
2  * Copyright (C) 2017 The Guava Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.common.hash;
18 
19 import com.google.caliper.BeforeExperiment;
20 import com.google.caliper.Benchmark;
21 import com.google.caliper.Param;
22 import java.nio.charset.StandardCharsets;
23 import java.util.Random;
24 
25 /** Benchmarks for the hashing of UTF-8 strings. */
26 public class HashStringBenchmark {
27   static class MaxCodePoint {
28     final int value;
29 
30     /**
31      * Convert the input string to a code point. Accepts regular decimal numerals, hex strings, and
32      * some symbolic names meaningful to humans.
33      */
decode(String userFriendly)34     private static int decode(String userFriendly) {
35       try {
36         return Integer.decode(userFriendly);
37       } catch (NumberFormatException ignored) {
38         if (userFriendly.matches("(?i)(?:American|English|ASCII)")) {
39           // 1-byte UTF-8 sequences - "American" ASCII text
40           return 0x80;
41         } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) {
42           // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte
43           // sequences - "Western European" text
44           return 0x90;
45         } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) {
46           // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time.
47           return 0x100;
48         } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) {
49           // Mostly 2-byte UTF-8 sequences - "European" text
50           return 0x800;
51         } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) {
52           // Mostly 3-byte UTF-8 sequences - "Asian" text
53           return Character.MIN_SUPPLEMENTARY_CODE_POINT;
54         } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) {
55           // Mostly 4-byte UTF-8 sequences - "rare exotic" text
56           return Character.MAX_CODE_POINT;
57         } else {
58           throw new IllegalArgumentException("Can't decode codepoint " + userFriendly);
59         }
60       }
61     }
62 
valueOf(String userFriendly)63     public static MaxCodePoint valueOf(String userFriendly) {
64       return new MaxCodePoint(userFriendly);
65     }
66 
MaxCodePoint(String userFriendly)67     public MaxCodePoint(String userFriendly) {
68       value = decode(userFriendly);
69     }
70   }
71 
72   /**
73    * The default values of maxCodePoint below provide pretty good performance models of different
74    * kinds of common human text.
75    *
76    * @see MaxCodePoint#decode
77    */
78   @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"})
79   MaxCodePoint maxCodePoint;
80 
81   @Param({"16384"})
82   int charCount;
83 
84   @Param({"MURMUR3_32", "MURMUR3_128", "SHA1"})
85   HashFunctionEnum hashFunctionEnum;
86 
87   private String[] strings;
88 
89   static final int SAMPLES = 0x100;
90   static final int SAMPLE_MASK = 0xFF;
91 
92   /**
93    * Compute arrays of valid unicode text, and store it in 3 forms: byte arrays, Strings, and
94    * StringBuilders (in a CharSequence[] to make it a little harder for the JVM).
95    */
96   @BeforeExperiment
setUp()97   void setUp() {
98     final long seed = 99;
99     final Random rnd = new Random(seed);
100     strings = new String[SAMPLES];
101     for (int i = 0; i < SAMPLES; i++) {
102       StringBuilder sb = new StringBuilder();
103       for (int j = 0; j < charCount; j++) {
104         int codePoint;
105         // discard illegal surrogate "codepoints"
106         do {
107           codePoint = rnd.nextInt(maxCodePoint.value);
108         } while (Character.isSurrogate((char) codePoint));
109         sb.appendCodePoint(codePoint);
110       }
111       strings[i] = sb.toString();
112     }
113   }
114 
115   @Benchmark
hashUtf8(int reps)116   int hashUtf8(int reps) {
117     int res = 0;
118     for (int i = 0; i < reps; i++) {
119       res +=
120           System.identityHashCode(
121               hashFunctionEnum
122                   .getHashFunction()
123                   .hashString(strings[i & SAMPLE_MASK], StandardCharsets.UTF_8));
124     }
125     return res;
126   }
127 
128   @Benchmark
hashUtf8Hasher(int reps)129   int hashUtf8Hasher(int reps) {
130     int res = 0;
131     for (int i = 0; i < reps; i++) {
132       res +=
133           System.identityHashCode(
134               hashFunctionEnum
135                   .getHashFunction()
136                   .newHasher()
137                   .putString(strings[i & SAMPLE_MASK], StandardCharsets.UTF_8)
138                   .hash());
139     }
140     return res;
141   }
142 
143   @Benchmark
hashUtf8GetBytes(int reps)144   int hashUtf8GetBytes(int reps) {
145     int res = 0;
146     for (int i = 0; i < reps; i++) {
147       res +=
148           System.identityHashCode(
149               hashFunctionEnum
150                   .getHashFunction()
151                   .hashBytes(strings[i & SAMPLE_MASK].getBytes(StandardCharsets.UTF_8)));
152     }
153     return res;
154   }
155 
156   @Benchmark
hashUtf8GetBytesHasher(int reps)157   int hashUtf8GetBytesHasher(int reps) {
158     int res = 0;
159     for (int i = 0; i < reps; i++) {
160       res +=
161           System.identityHashCode(
162               hashFunctionEnum
163                   .getHashFunction()
164                   .newHasher()
165                   .putBytes(strings[i & SAMPLE_MASK].getBytes(StandardCharsets.UTF_8))
166                   .hash());
167     }
168     return res;
169   }
170 }
171