/* * Copyright (C) 2017 The Guava Authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.common.hash; import com.google.caliper.BeforeExperiment; import com.google.caliper.Benchmark; import com.google.caliper.Param; import java.nio.charset.StandardCharsets; import java.util.Random; /** Benchmarks for the hashing of UTF-8 strings. */ public class HashStringBenchmark { static class MaxCodePoint { final int value; /** * Convert the input string to a code point. Accepts regular decimal numerals, hex strings, and * some symbolic names meaningful to humans. */ private static int decode(String userFriendly) { try { return Integer.decode(userFriendly); } catch (NumberFormatException ignored) { if (userFriendly.matches("(?i)(?:American|English|ASCII)")) { // 1-byte UTF-8 sequences - "American" ASCII text return 0x80; } else if (userFriendly.matches("(?i)(?:French|Latin|Western.*European)")) { // Mostly 1-byte UTF-8 sequences, mixed with occasional 2-byte // sequences - "Western European" text return 0x90; } else if (userFriendly.matches("(?i)(?:Branch.*Prediction.*Hostile)")) { // Defeat branch predictor for: c < 0x80 ; branch taken 50% of the time. return 0x100; } else if (userFriendly.matches("(?i)(?:Greek|Cyrillic|European|ISO.?8859)")) { // Mostly 2-byte UTF-8 sequences - "European" text return 0x800; } else if (userFriendly.matches("(?i)(?:Chinese|Han|Asian|BMP)")) { // Mostly 3-byte UTF-8 sequences - "Asian" text return Character.MIN_SUPPLEMENTARY_CODE_POINT; } else if (userFriendly.matches("(?i)(?:Cuneiform|rare|exotic|supplementary.*)")) { // Mostly 4-byte UTF-8 sequences - "rare exotic" text return Character.MAX_CODE_POINT; } else { throw new IllegalArgumentException("Can't decode codepoint " + userFriendly); } } } public static MaxCodePoint valueOf(String userFriendly) { return new MaxCodePoint(userFriendly); } public MaxCodePoint(String userFriendly) { value = decode(userFriendly); } } /** * The default values of maxCodePoint below provide pretty good performance models of different * kinds of common human text. * * @see MaxCodePoint#decode */ @Param({"0x80", "0x90", "0x100", "0x800", "0x10000", "0x10ffff"}) MaxCodePoint maxCodePoint; @Param({"16384"}) int charCount; @Param({"MURMUR3_32", "MURMUR3_128", "SHA1"}) HashFunctionEnum hashFunctionEnum; private String[] strings; static final int SAMPLES = 0x100; static final int SAMPLE_MASK = 0xFF; /** * Compute arrays of valid unicode text, and store it in 3 forms: byte arrays, Strings, and * StringBuilders (in a CharSequence[] to make it a little harder for the JVM). */ @BeforeExperiment void setUp() { final long seed = 99; final Random rnd = new Random(seed); strings = new String[SAMPLES]; for (int i = 0; i < SAMPLES; i++) { StringBuilder sb = new StringBuilder(); for (int j = 0; j < charCount; j++) { int codePoint; // discard illegal surrogate "codepoints" do { codePoint = rnd.nextInt(maxCodePoint.value); } while (Character.isSurrogate((char) codePoint)); sb.appendCodePoint(codePoint); } strings[i] = sb.toString(); } } @Benchmark int hashUtf8(int reps) { int res = 0; for (int i = 0; i < reps; i++) { res += System.identityHashCode( hashFunctionEnum .getHashFunction() .hashString(strings[i & SAMPLE_MASK], StandardCharsets.UTF_8)); } return res; } @Benchmark int hashUtf8Hasher(int reps) { int res = 0; for (int i = 0; i < reps; i++) { res += System.identityHashCode( hashFunctionEnum .getHashFunction() .newHasher() .putString(strings[i & SAMPLE_MASK], StandardCharsets.UTF_8) .hash()); } return res; } @Benchmark int hashUtf8GetBytes(int reps) { int res = 0; for (int i = 0; i < reps; i++) { res += System.identityHashCode( hashFunctionEnum .getHashFunction() .hashBytes(strings[i & SAMPLE_MASK].getBytes(StandardCharsets.UTF_8))); } return res; } @Benchmark int hashUtf8GetBytesHasher(int reps) { int res = 0; for (int i = 0; i < reps; i++) { res += System.identityHashCode( hashFunctionEnum .getHashFunction() .newHasher() .putBytes(strings[i & SAMPLE_MASK].getBytes(StandardCharsets.UTF_8)) .hash()); } return res; } }