1 /* 2 * Copyright (C) 2017 The Guava Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.common.hash; 18 19 import com.google.caliper.BeforeExperiment; 20 import com.google.caliper.Benchmark; 21 import com.google.caliper.Param; 22 import java.nio.charset.StandardCharsets; 23 import java.util.Random; 24 25 /** Benchmarks for the hashing of UTF-8 strings. */ 26 public class HashStringBenchmark { 27 static class MaxCodePoint { 28 final int value; 29 30 /** 31 * Convert the input string to a code point. Accepts regular decimal numerals, hex strings, and 32 * some symbolic names meaningful to humans. 33 */ decode(String userFriendly)34 private static int decode(String userFriendly) { 35 try { 36 return Integer.decode(userFriendly); 37 } catch (NumberFormatException ignored) { 38 if (userFriendly.matches("(?i)(?:American|English|ASCII)")) { 39 // 1-byte UTF-8 sequences - "American" ASCII text 40 return 0x80; 41 } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) { 42 // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte 43 // sequences - "Western European" text 44 return 0x90; 45 } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) { 46 // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time. 47 return 0x100; 48 } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) { 49 // Mostly 2-byte UTF-8 sequences - "European" text 50 return 0x800; 51 } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) { 52 // Mostly 3-byte UTF-8 sequences - "Asian" text 53 return Character.MIN_SUPPLEMENTARY_CODE_POINT; 54 } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) { 55 // Mostly 4-byte UTF-8 sequences - "rare exotic" text 56 return Character.MAX_CODE_POINT; 57 } else { 58 throw new IllegalArgumentException("Can't decode codepoint " + userFriendly); 59 } 60 } 61 } 62 valueOf(String userFriendly)63 public static MaxCodePoint valueOf(String userFriendly) { 64 return new MaxCodePoint(userFriendly); 65 } 66 MaxCodePoint(String userFriendly)67 public MaxCodePoint(String userFriendly) { 68 value = decode(userFriendly); 69 } 70 } 71 72 /** 73 * The default values of maxCodePoint below provide pretty good performance models of different 74 * kinds of common human text. 75 * 76 * @see MaxCodePoint#decode 77 */ 78 @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"}) 79 MaxCodePoint maxCodePoint; 80 81 @Param({"16384"}) 82 int charCount; 83 84 @Param({"MURMUR3_32", "MURMUR3_128", "SHA1"}) 85 HashFunctionEnum hashFunctionEnum; 86 87 private String[] strings; 88 89 static final int SAMPLES = 0x100; 90 static final int SAMPLE_MASK = 0xFF; 91 92 /** 93 * Compute arrays of valid unicode text, and store it in 3 forms: byte arrays, Strings, and 94 * StringBuilders (in a CharSequence[] to make it a little harder for the JVM). 95 */ 96 @BeforeExperiment setUp()97 void setUp() { 98 final long seed = 99; 99 final Random rnd = new Random(seed); 100 strings = new String[SAMPLES]; 101 for (int i = 0; i < SAMPLES; i++) { 102 StringBuilder sb = new StringBuilder(); 103 for (int j = 0; j < charCount; j++) { 104 int codePoint; 105 // discard illegal surrogate "codepoints" 106 do { 107 codePoint = rnd.nextInt(maxCodePoint.value); 108 } while (Character.isSurrogate((char) codePoint)); 109 sb.appendCodePoint(codePoint); 110 } 111 strings[i] = sb.toString(); 112 } 113 } 114 115 @Benchmark hashUtf8(int reps)116 int hashUtf8(int reps) { 117 int res = 0; 118 for (int i = 0; i < reps; i++) { 119 res += 120 System.identityHashCode( 121 hashFunctionEnum 122 .getHashFunction() 123 .hashString(strings[i & SAMPLE_MASK], StandardCharsets.UTF_8)); 124 } 125 return res; 126 } 127 128 @Benchmark hashUtf8Hasher(int reps)129 int hashUtf8Hasher(int reps) { 130 int res = 0; 131 for (int i = 0; i < reps; i++) { 132 res += 133 System.identityHashCode( 134 hashFunctionEnum 135 .getHashFunction() 136 .newHasher() 137 .putString(strings[i & SAMPLE_MASK], StandardCharsets.UTF_8) 138 .hash()); 139 } 140 return res; 141 } 142 143 @Benchmark hashUtf8GetBytes(int reps)144 int hashUtf8GetBytes(int reps) { 145 int res = 0; 146 for (int i = 0; i < reps; i++) { 147 res += 148 System.identityHashCode( 149 hashFunctionEnum 150 .getHashFunction() 151 .hashBytes(strings[i & SAMPLE_MASK].getBytes(StandardCharsets.UTF_8))); 152 } 153 return res; 154 } 155 156 @Benchmark hashUtf8GetBytesHasher(int reps)157 int hashUtf8GetBytesHasher(int reps) { 158 int res = 0; 159 for (int i = 0; i < reps; i++) { 160 res += 161 System.identityHashCode( 162 hashFunctionEnum 163 .getHashFunction() 164 .newHasher() 165 .putBytes(strings[i & SAMPLE_MASK].getBytes(StandardCharsets.UTF_8)) 166 .hash()); 167 } 168 return res; 169 } 170 } 171