1 /* 2 * Copyright 2016 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include "Benchmark.h" 9 #include "SkTypes.h" 10 11 /** 12 * There's a good variety of ways to pack from int down to uint16_t with SSE, 13 * depending on the specific instructions available. 14 * 15 * SSE2 offers an int -> int16_t pack instruction. We can use this in two ways: 16 * - subtract off 32768, int -> int16_t, add 32768 back (sse2_a) 17 * - first artificially sign extend the (positive) value in our int, then int -> int16_t (sse2_b) 18 * SSSE3 adds a byte shuffle, so we just put the bytes where we want them. (ssse3) 19 * SSE41 added an int -> uint16_t pack instruction. (sse41) 20 * 21 * Findings so far: 22 * - sse41 < ssse3 <<< sse2_b < sse2_a; 23 * - the ssse3 version is only slightly slower than the sse41 version, maybe not at all 24 * - the sse2_a is only slightly slower than the sse2_b version 25 * - the ssse3 and sse41 versions are about 3x faster than either sse2 version 26 * - the sse41 version seems to cause some code generation trouble. 27 */ 28 29 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 30 31 #include <immintrin.h> 32 33 template <__m128i (kernel)(__m128i)> 34 class pack_int_uint16_t_Bench : public Benchmark { 35 public: pack_int_uint16_t_Bench(const char * impl)36 pack_int_uint16_t_Bench(const char* impl) { 37 fName.append("pack_int_uint16_t_"); 38 fName.append(impl); 39 } 40 isSuitableFor(Backend backend)41 bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; } onGetName()42 const char* onGetName() override { return fName.c_str(); } 43 onDraw(int loops,SkCanvas *)44 void onDraw(int loops, SkCanvas*) override { 45 __m128i x = _mm_set1_epi32(0x42424242); 46 while (loops --> 0) { 47 x = kernel(x); 48 } 49 50 volatile int blackhole = 0; 51 blackhole ^= _mm_cvtsi128_si32(x); 52 } 53 54 SkString fName; 55 }; 56 57 namespace { sse2_a(__m128i x)58 __m128i sse2_a(__m128i x) { 59 x = _mm_sub_epi32(x, _mm_set1_epi32(0x8000)); 60 return _mm_add_epi16(_mm_packs_epi32(x,x), _mm_set1_epi16((short)0x8000)); 61 } 62 } 63 DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_a>("sse2_a"); ) 64 65 namespace { sse2_b(__m128i x)66 __m128i sse2_b(__m128i x) { 67 x = _mm_srai_epi32(_mm_slli_epi32(x, 16), 16); 68 return _mm_packs_epi32(x,x); 69 } 70 } 71 DEF_BENCH( return new pack_int_uint16_t_Bench<sse2_b>("sse2_b"); ) 72 73 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 74 namespace { ssse3(__m128i x)75 __m128i ssse3(__m128i x) { 76 // TODO: Can we force the bench to load the mask inside the loop? Would be more realistic. 77 const int _ = ~0; 78 return _mm_shuffle_epi8(x, _mm_setr_epi8(0,1, 4,5, 8,9, 12,13, _,_,_,_,_,_,_,_)); 79 } 80 } 81 DEF_BENCH( return new pack_int_uint16_t_Bench<ssse3>("ssse3"); ) 82 #endif 83 84 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 85 namespace { sse41(__m128i x)86 __m128i sse41(__m128i x) { 87 return _mm_packus_epi32(x,x); 88 } 89 } 90 DEF_BENCH( return new pack_int_uint16_t_Bench<sse41>("sse41"); ) 91 #endif 92 93 #endif // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 94