1 /*
2  * Copyright 2009 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkBitmapFilter_opts_SSE2.h"
9 #include "SkBitmapProcState_opts_SSE2.h"
10 #include "SkBitmapProcState_opts_SSSE3.h"
11 #include "SkBitmapScaler.h"
12 #include "SkBlitMask.h"
13 #include "SkBlitRow.h"
14 #include "SkBlitRow_opts_SSE2.h"
15 #include "SkBlitRow_opts_SSE4.h"
16 #include "SkOncePtr.h"
17 #include "SkRTConf.h"
18 
19 #if defined(_MSC_VER) && defined(_WIN64)
20 #include <intrin.h>
21 #endif
22 
23 /* This file must *not* be compiled with -msse or any other optional SIMD
24    extension, otherwise gcc may generate SIMD instructions even for scalar ops
25    (and thus give an invalid instruction on Pentium3 on the code below).
26    For example, only files named *_SSE2.cpp in this directory should be
27    compiled with -msse2 or higher. */
28 
29 
/* Execute the CPUID instruction for leaf |info_type| and store the resulting
 * EAX, EBX, ECX and EDX register values in info[0], info[1], info[2] and
 * info[3] respectively. One implementation is provided per compiler/target.
 *
 * The hand-written assembly variants zero ECX before executing CPUID so that
 * leaves which take a sub-leaf index always query sub-leaf 0 instead of
 * whatever value happened to be left in the register.
 */
#ifdef _MSC_VER
static inline void getcpuid(int info_type, int info[4]) {
#if defined(_WIN64)
    // No inline assembler in this configuration; use the compiler intrinsic.
    __cpuid(info, info_type);
#else
    // ECX is cleared first so the sub-leaf input is well defined.
    __asm {
        mov    eax, [info_type]
        xor    ecx, ecx
        cpuid
        mov    edi, [info]
        mov    [edi], eax
        mov    [edi+4], ebx
        mov    [edi+8], ecx
        mov    [edi+12], edx
    }
#endif
}
#elif defined(__x86_64__)
static inline void getcpuid(int info_type, int info[4]) {
    __asm__ __volatile__ (
        "cpuid \n\t"
        : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
        : "a"(info_type), "c"(0)
    );
}
#else
static inline void getcpuid(int info_type, int info[4]) {
    // EBX must be preserved so this code stays compatible with -fPIC. It is
    // parked in EDI across the CPUID instruction and swapped back afterwards,
    // which leaves CPUID's EBX result in EDI. The EBX result is bound to EDI
    // explicitly ("=D"): a generic register constraint could be assigned EBX
    // itself in non-PIC builds, and restoring EBX would then destroy the
    // result.
    __asm__ __volatile__ (
        "movl %%ebx, %%edi    \n\t"
        "cpuid                \n\t"
        "xchgl %%ebx, %%edi   \n\t"
        : "=a"(info[0]), "=D"(info[1]), "=c"(info[2]), "=d"(info[3])
        : "a"(info_type), "c"(0)
    );
}
#endif
68 
69 ////////////////////////////////////////////////////////////////////////////////
70 
71 /* Fetch the SIMD level directly from the CPU, at run-time.
72  * Only checks the levels needed by the optimizations in this file.
73  */
get_SIMD_level()74 static int* get_SIMD_level() {
75     int cpu_info[4] = { 0, 0, 0, 0 };
76     getcpuid(1, cpu_info);
77 
78     int* level = new int;
79 
80     if ((cpu_info[2] & (1<<20)) != 0) {
81         *level = SK_CPU_SSE_LEVEL_SSE42;
82     } else if ((cpu_info[2] & (1<<19)) != 0) {
83         *level = SK_CPU_SSE_LEVEL_SSE41;
84     } else if ((cpu_info[2] & (1<<9)) != 0) {
85         *level = SK_CPU_SSE_LEVEL_SSSE3;
86     } else if ((cpu_info[3] & (1<<26)) != 0) {
87         *level = SK_CPU_SSE_LEVEL_SSE2;
88     } else {
89         *level = 0;
90     }
91     return level;
92 }
93 
94 SK_DECLARE_STATIC_ONCE_PTR(int, gSIMDLevel);
95 
96 /* Verify that the requested SIMD level is supported in the build.
97  * If not, check if the platform supports it.
98  */
supports_simd(int minLevel)99 static inline bool supports_simd(int minLevel) {
100 #if defined(SK_CPU_SSE_LEVEL)
101     if (minLevel <= SK_CPU_SSE_LEVEL) {
102         return true;
103     } else
104 #endif
105     {
106 #if defined(SK_BUILD_FOR_ANDROID_FRAMEWORK)
107         /* For the Android framework we should always know at compile time if the device
108          * we are building for supports SSSE3.  The one exception to this rule is on the
109          * emulator where we are compiled without the -mssse3 option (so we have no
110          * SSSE3 procs) but can be run on a host machine that supports SSSE3
111          * instructions. So for that particular case we disable our SSSE3 options.
112          */
113         return false;
114 #else
115         return minLevel <= *gSIMDLevel.get(get_SIMD_level);
116 #endif
117     }
118 }
119 
120 ////////////////////////////////////////////////////////////////////////////////
121 
PlatformConvolutionProcs(SkConvolutionProcs * procs)122 void SkBitmapScaler::PlatformConvolutionProcs(SkConvolutionProcs* procs) {
123     if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
124         procs->fExtraHorizontalReads = 3;
125         procs->fConvolveVertically = &convolveVertically_SSE2;
126         procs->fConvolve4RowsHorizontally = &convolve4RowsHorizontally_SSE2;
127         procs->fConvolveHorizontally = &convolveHorizontally_SSE2;
128         procs->fApplySIMDPadding = &applySIMDPadding_SSE2;
129     }
130 }
131 
132 ////////////////////////////////////////////////////////////////////////////////
133 
platformProcs()134 void SkBitmapProcState::platformProcs() {
135     /* Every optimization in the function requires at least SSE2 */
136     if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
137         return;
138     }
139     const bool ssse3 = supports_simd(SK_CPU_SSE_LEVEL_SSSE3);
140 
141     /* Check fSampleProc32 */
142     if (fSampleProc32 == S32_opaque_D32_filter_DX) {
143         if (ssse3) {
144             fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
145         } else {
146             fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
147         }
148     } else if (fSampleProc32 == S32_opaque_D32_filter_DXDY) {
149         if (ssse3) {
150             fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3;
151         }
152     } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
153         if (ssse3) {
154             fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
155         } else {
156             fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
157         }
158     } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) {
159         if (ssse3) {
160             fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3;
161         }
162     }
163 
164     /* Check fMatrixProc */
165     if (fMatrixProc == ClampX_ClampY_filter_scale) {
166         fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
167     } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
168         fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
169     } else if (fMatrixProc == ClampX_ClampY_filter_affine) {
170         fMatrixProc = ClampX_ClampY_filter_affine_SSE2;
171     } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) {
172         fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2;
173     }
174 }
175 
176 ////////////////////////////////////////////////////////////////////////////////
177 
178 static const SkBlitRow::Proc16 platform_16_procs[] = {
179     S32_D565_Opaque_SSE2,               // S32_D565_Opaque
180     nullptr,                               // S32_D565_Blend
181     S32A_D565_Opaque_SSE2,              // S32A_D565_Opaque
182     nullptr,                               // S32A_D565_Blend
183     S32_D565_Opaque_Dither_SSE2,        // S32_D565_Opaque_Dither
184     nullptr,                               // S32_D565_Blend_Dither
185     S32A_D565_Opaque_Dither_SSE2,       // S32A_D565_Opaque_Dither
186     nullptr,                               // S32A_D565_Blend_Dither
187 };
188 
PlatformFactory565(unsigned flags)189 SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) {
190     if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
191         return platform_16_procs[flags];
192     } else {
193         return nullptr;
194     }
195 }
196 
197 static const SkBlitRow::ColorProc16 platform_565_colorprocs_SSE2[] = {
198     Color32A_D565_SSE2,                 // Color32A_D565,
199     nullptr,                               // Color32A_D565_Dither
200 };
201 
PlatformColorFactory565(unsigned flags)202 SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) {
203 /* If you're thinking about writing an SSE4 version of this, do check it's
204  * actually faster on Atom. Our original SSE4 version was slower than this
205  * SSE2 version on Silvermont, and only marginally faster on a Core i7,
206  * mainly due to the MULLD timings.
207  */
208     if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
209         return platform_565_colorprocs_SSE2[flags];
210     } else {
211         return nullptr;
212     }
213 }
214 
215 static const SkBlitRow::Proc32 platform_32_procs_SSE2[] = {
216     nullptr,                               // S32_Opaque,
217     S32_Blend_BlitRow32_SSE2,           // S32_Blend,
218     S32A_Opaque_BlitRow32_SSE2,         // S32A_Opaque
219     S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
220 };
221 
222 static const SkBlitRow::Proc32 platform_32_procs_SSE4[] = {
223     nullptr,                               // S32_Opaque,
224     S32_Blend_BlitRow32_SSE2,           // S32_Blend,
225     S32A_Opaque_BlitRow32_SSE4,         // S32A_Opaque
226     S32A_Blend_BlitRow32_SSE2,          // S32A_Blend,
227 };
228 
PlatformProcs32(unsigned flags)229 SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
230     if (supports_simd(SK_CPU_SSE_LEVEL_SSE41)) {
231         return platform_32_procs_SSE4[flags];
232     } else
233     if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
234         return platform_32_procs_SSE2[flags];
235     } else {
236         return nullptr;
237     }
238 }
239 
240 ////////////////////////////////////////////////////////////////////////////////
241 
PlatformBlitRowProcs16(bool isOpaque)242 SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) {
243     if (supports_simd(SK_CPU_SSE_LEVEL_SSE2)) {
244         if (isOpaque) {
245             return SkBlitLCD16OpaqueRow_SSE2;
246         } else {
247             return SkBlitLCD16Row_SSE2;
248         }
249     } else {
250         return nullptr;
251     }
252 
253 }
254 
PlatformRowProcs(SkColorType,SkMask::Format,RowFlags)255 SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkColorType, SkMask::Format, RowFlags) {
256     return nullptr;
257 }
258