/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
    defined(_MSC_VER) && !defined(__clang__)
#include <emmintrin.h>
#include <tmmintrin.h>  // For _mm_maddubs_epi16
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C.
#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
    defined(_MSC_VER) && !defined(__clang__)

struct YuvConstants {
  lvec8 kUVToB;     // 0
  lvec8 kUVToG;     // 32
  lvec8 kUVToR;     // 64
  lvec16 kUVBiasB;  // 96
  lvec16 kUVBiasG;  // 128
  lvec16 kUVBiasR;  // 160
  lvec16 kYToRgb;   // 192
};

// BT.601 YUV to RGB reference
//  R = (Y - 16) * 1.164              - V * -1.596
//  G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
//  B = (Y - 16) * 1.164 - U * -2.018

// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
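
// For reference, a scalar equivalent of the fixed point math the SIMD row
// functions below implement (an illustrative sketch; Clamp0To255 and
// YuvPixelC are hypothetical helpers, not part of this file):
static __inline int32 Clamp0To255(int32 v) {
  return v < 0 ? 0 : (v > 255 ? 255 : v);
}
static __inline void YuvPixelC(uint8 y, uint8 u, uint8 v,
                               uint8* b, uint8* g, uint8* r) {
  // Replicating y into both bytes of a 16 bit lane and keeping the high half
  // of the product matches pmulhuw with kYToRgb below.
  uint32 y1 = ((uint32)(y * 0x0101) * YG) >> 16;
  *b = (uint8)Clamp0To255((int32)(BB - (u * UB) + y1) >> 6);
  *g = (uint8)Clamp0To255((int32)(BG - (u * UG + v * VG) + y1) >> 6);
  *r = (uint8)Clamp0To255((int32)(BR - (v * VR) + y1) >> 6);
}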

// BT601 constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};

// BT601 constants for NV21 where chroma plane is VU instead of UV.
static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};

#undef YG
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR

// JPEG YUV to RGB reference
// *  R = Y                - V * -1.40200
// *  G = Y - U *  0.34414 - V *  0.71414
// *  B = Y - U * -1.77200

// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32 /* 64 / 2 */

// U and V contributions to R,G,B.
#define UBJ -113 /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414 * 64) */
#define VRJ -90 /* round(-1.40200 * 64) */

// Bias values to round, and to subtract 128 from U and V.
#define BBJ (UBJ * 128 + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ (VRJ * 128 + YGBJ)

// JPEG constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
};

#undef YGJ
#undef YGBJ
#undef UBJ
#undef UGJ
#undef VGJ
#undef VRJ
#undef BBJ
#undef BGJ
#undef BRJ

// 64 bit
#if defined(_M_X64)
#if defined(HAS_I422TOARGBROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm3;
  const __m128i xmm5 = _mm_set1_epi8(-1);
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;

  while (width > 0) {
    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
    xmm1 = _mm_loadu_si128(&xmm0);
    xmm2 = _mm_loadu_si128(&xmm0);
    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
    xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
    xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
    xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
    xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
    xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
    xmm0 = _mm_adds_epi16(xmm0, xmm3);
    xmm1 = _mm_adds_epi16(xmm1, xmm3);
    xmm2 = _mm_adds_epi16(xmm2, xmm3);
    xmm0 = _mm_srai_epi16(xmm0, 6);
    xmm1 = _mm_srai_epi16(xmm1, 6);
    xmm2 = _mm_srai_epi16(xmm2, 6);
    xmm0 = _mm_packus_epi16(xmm0, xmm0);
    xmm1 = _mm_packus_epi16(xmm1, xmm1);
    xmm2 = _mm_packus_epi16(xmm2, xmm2);
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
    xmm1 = _mm_loadu_si128(&xmm0);
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);

    _mm_storeu_si128((__m128i*)dst_argb, xmm0);
    _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1);

    y_buf += 8;
    u_buf += 4;
    dst_argb += 32;
    width -= 8;
  }
}
#endif
// 32 bit
#else  // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPEG full range.
static const vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};
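
// The kARGBToY coefficients above are 7 bit fixed point with a +16 bias, so
// a scalar equivalent of ARGBToYRow below is (an illustrative sketch; the
// helper name is hypothetical):
static __inline uint8 ARGBPixelToY(uint8 b, uint8 g, uint8 r) {
  // pmaddubsw forms the per-channel products, phaddw sums them, psrlw
  // shifts down by 7 and paddb adds the bias of 16.
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}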

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RGB24 for I422ToRGB24: the first 8
// pixels, then the next 4.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RAW for I422ToRAW: the first 8
// pixels, then the next 4.
static const uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
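
// The tables above are pshufb control masks: each output byte selects the
// source byte at the given index, and an index with its high bit set (the
// 128u entries) produces zero. A scalar sketch of that semantic (the helper
// is hypothetical, for illustration only):
static __inline void PShufB16C(const uint8* src, const uint8* mask,
                               uint8* dst) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}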

// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_y
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm5, xmm5 // generate mask 0xff000000
    pslld xmm5, 24

  convertloop:
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0
    punpckhwd xmm1, xmm1
    por xmm0, xmm5
    por xmm1, xmm5
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}

#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_y
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
    vpslld ymm5, ymm5, 24

  convertloop:
    vmovdqu xmm0, [eax]
    lea eax, [eax + 16]
    vpermq ymm0, ymm0, 0xd8
    vpunpcklbw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    vpunpckhwd ymm1, ymm0, ymm0
    vpunpcklwd ymm0, ymm0, ymm0
    vpor ymm0, ymm0, ymm5
    vpor ymm1, ymm1, ymm5
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_J400TOARGBROW_AVX2

__declspec(naked)
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_rgb24
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm5, xmm5 // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, kShuffleMaskRGB24ToARGB

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15] }
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
  __asm {
    mov eax, [esp + 4] // src_raw
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm5, xmm5 // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, kShuffleMaskRAWToARGB

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15] }
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked)
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
    psllw xmm4, 10
    psrlw xmm4, 5
    pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4] // src_rgb565
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax] // fetch 8 pixels of bgr565
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    pand xmm1, xmm3 // R in upper 5 bits
    psllw xmm2, 11 // B in upper 5 bits
    pmulhuw xmm1, xmm5 // * (256 + 8)
    pmulhuw xmm2, xmm5 // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2 // RB
    pand xmm0, xmm4 // G in middle 6 bits
    pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
    por xmm0, xmm7 // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
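
// A scalar sketch of the pmulhuw trick used above (hypothetical helper, for
// illustration): with a field isolated in place, multiplying by 0x0108 (for
// a 5 bit field in bits 15..11) or 0x2080 (for the 6 bit green field in
// bits 10..5) leaves the bit-replicated 8 bit value in the high half of the
// 32 bit product.
static __inline uint8 Expand565FieldC(uint16 masked, uint16 multiplier) {
  // e.g. masked = B << 11, multiplier = 0x0108:
  // (B << 11) * 0x0108 == (B << 19) | (B << 14), whose high 16 bits are
  // (B << 3) | (B >> 2), the classic bit replication.
  return (uint8)(((uint32)masked * multiplier) >> 16);
}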

#ifdef HAS_RGB565TOARGBROW_AVX2
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked)
void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
    movd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
    vpsllw ymm4, ymm4, 10
    vpsrlw ymm4, ymm4, 5
    vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8

    mov eax, [esp + 4] // src_rgb565
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
    vpand ymm1, ymm0, ymm3 // R in upper 5 bits
    vpsllw ymm2, ymm0, 11 // B in upper 5 bits
    vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
    vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2 // RB
    vpand ymm0, ymm0, ymm4 // G in middle 6 bits
    vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
    vpor ymm0, ymm0, ymm7 // AG
    vpermq ymm0, ymm0, 0xd8 // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_RGB565TOARGBROW_AVX2

#ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked)
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
    movd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
    vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8

    mov eax, [esp + 4] // src_argb1555
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
    vpsllw ymm1, ymm0, 1 // R in upper 5 bits
    vpsllw ymm2, ymm0, 11 // B in upper 5 bits
    vpand ymm1, ymm1, ymm3
    vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
    vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2 // RB
    vpsraw ymm2, ymm0, 8 // A
    vpand ymm0, ymm0, ymm4 // G in middle 5 bits
    vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
    vpand ymm2, ymm2, ymm7
    vpor ymm0, ymm0, ymm2 // AG
    vpermq ymm0, ymm0, 0xd8 // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB1555TOARGBROW_AVX2

#ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked)
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
    vmovd xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
    mov eax, [esp + 4] // src_argb4444
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
    vpand ymm2, ymm0, ymm5 // mask high nibbles
    vpand ymm0, ymm0, ymm4 // mask low nibbles
    vpsrlw ymm3, ymm2, 4
    vpsllw ymm1, ymm0, 4
    vpor ymm2, ymm2, ymm3
    vpor ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8 // mutate for unpack
    vpermq ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2
    vpunpcklbw ymm0, ymm0, ymm2
    vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB4444TOARGBROW_AVX2

// 24 instructions
__declspec(naked)
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
    psrlw xmm4, 6
    pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4] // src_argb1555
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax] // fetch 8 pixels of 1555
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    psllw xmm1, 1 // R in upper 5 bits
    psllw xmm2, 11 // B in upper 5 bits
    pand xmm1, xmm3
    pmulhuw xmm2, xmm5 // * (256 + 8)
    pmulhuw xmm1, xmm5 // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2 // RB
    movdqa xmm2, xmm0
    pand xmm0, xmm4 // G in middle 5 bits
    psraw xmm2, 8 // A
    pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
    pand xmm2, xmm7
    por xmm0, xmm2 // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked)
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
    movd xmm4, eax
    pshufd xmm4, xmm4, 0
    movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
    pslld xmm5, 4
    mov eax, [esp + 4] // src_argb4444
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
    movdqa xmm2, xmm0
    pand xmm0, xmm4 // mask low nibbles
    pand xmm2, xmm5 // mask high nibbles
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    psllw xmm1, 4
    psrlw xmm3, 4
    por xmm0, xmm1
    por xmm2, xmm3
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}

__declspec(naked)
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // pix
    movdqa xmm6, kShuffleMaskARGBToRGB24

  convertloop:
    movdqu xmm0, [eax] // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1 // 4 bytes from 1 for 0
    psrldq xmm1, 4 // 8 bytes from 1
    pslldq xmm4, 12 // 4 bytes from 1 for 0
    movdqa xmm5, xmm2 // 8 bytes from 2 for 1
    por xmm0, xmm4 // 4 bytes from 1 for 0
    pslldq xmm5, 8 // 8 bytes from 2 for 1
    movdqu [edx], xmm0 // store 0
    por xmm1, xmm5 // 8 bytes from 2 for 1
    psrldq xmm2, 8 // 4 bytes from 2
    pslldq xmm3, 4 // 12 bytes from 3 for 2
    por xmm2, xmm3 // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1 // store 1
    movdqu [edx + 32], xmm2 // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked)
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // pix
    movdqa xmm6, kShuffleMaskARGBToRAW

  convertloop:
    movdqu xmm0, [eax] // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1 // 4 bytes from 1 for 0
    psrldq xmm1, 4 // 8 bytes from 1
    pslldq xmm4, 12 // 4 bytes from 1 for 0
    movdqa xmm5, xmm2 // 8 bytes from 2 for 1
    por xmm0, xmm4 // 4 bytes from 1 for 0
    pslldq xmm5, 8 // 8 bytes from 2 for 1
    movdqu [edx], xmm0 // store 0
    por xmm1, xmm5 // 8 bytes from 2 for 1
    psrldq xmm2, 8 // 4 bytes from 2
    pslldq xmm3, 4 // 12 bytes from 3 for 2
    por xmm2, xmm3 // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1 // store 1
    movdqu [edx + 32], xmm2 // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

// 4 pixels
__declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax] // fetch 4 pixels of argb
    movdqa xmm1, xmm0 // B
    movdqa xmm2, xmm0 // G
    pslld xmm0, 8 // R
    psrld xmm1, 3 // B
    psrld xmm2, 5 // G
    psrad xmm0, 16 // R
    pand xmm1, xmm3 // B
    pand xmm2, xmm4 // G
    pand xmm0, xmm5 // R
    por xmm1, xmm2 // BG
    por xmm0, xmm1 // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

// 4 pixels with dither
__declspec(naked)
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int pix) {
  __asm {

    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    movd xmm6, [esp + 12] // dither4
    mov ecx, [esp + 16] // pix
    punpcklbw xmm6, xmm6 // make dither 16 bytes
    movdqa xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax] // fetch 4 pixels of argb
    paddusb xmm0, xmm6 // add dither
    movdqa xmm1, xmm0 // B
    movdqa xmm2, xmm0 // G
    pslld xmm0, 8 // R
    psrld xmm1, 3 // B
    psrld xmm2, 5 // G
    psrad xmm0, 16 // R
    pand xmm1, xmm3 // B
    pand xmm2, xmm4 // G
    pand xmm0, xmm5 // R
    por xmm1, xmm2 // BG
    por xmm0, xmm1 // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
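
// A scalar sketch of the dither-and-pack above (hypothetical helper, for
// illustration): one of the 4 dither bytes is added to all channels of a
// pixel with unsigned saturation (paddusb), then the channels are truncated
// and packed to 565.
static __inline uint16 DitherARGBTo565C(uint8 b, uint8 g, uint8 r,
                                        uint8 dither) {
  int b1 = b + dither;
  int g1 = g + dither;
  int r1 = r + dither;
  if (b1 > 255) b1 = 255;
  if (g1 > 255) g1 = 255;
  if (r1 > 255) r1 = 255;
  return (uint16)((b1 >> 3) | ((g1 >> 2) << 5) | ((r1 >> 3) << 11));
}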

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked)
void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    vbroadcastss xmm6, [esp + 12] // dither4
    mov ecx, [esp + 16] // pix
    vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
    vpermq ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11 // generate mask 0x0000f800

  convertloop:
    vmovdqu ymm0, [eax] // fetch 8 pixels of argb
    vpaddusb ymm0, ymm0, ymm6 // add dither
    vpsrld ymm2, ymm0, 5 // G
    vpsrld ymm1, ymm0, 3 // B
    vpsrld ymm0, ymm0, 8 // R
    vpand ymm2, ymm2, ymm4 // G
    vpand ymm1, ymm1, ymm3 // B
    vpand ymm0, ymm0, ymm5 // R
    vpor ymm1, ymm1, ymm2 // BG
    vpor ymm0, ymm0, ymm1 // BGR
    vpackusdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0 // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked)
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
    psrld xmm4, 27
    movdqa xmm5, xmm4 // generate mask 0x000003e0
    pslld xmm5, 5
    movdqa xmm6, xmm4 // generate mask 0x00007c00
    pslld xmm6, 10
    pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
    pslld xmm7, 15

  convertloop:
    movdqu xmm0, [eax] // fetch 4 pixels of argb
    movdqa xmm1, xmm0 // B
    movdqa xmm2, xmm0 // G
    movdqa xmm3, xmm0 // R
    psrad xmm0, 16 // A
    psrld xmm1, 3 // B
    psrld xmm2, 6 // G
    psrld xmm3, 9 // R
    pand xmm0, xmm7 // A
    pand xmm1, xmm4 // B
    pand xmm2, xmm5 // G
    pand xmm3, xmm6 // R
    por xmm0, xmm1 // BA
    por xmm2, xmm3 // GR
    por xmm0, xmm2 // BGRA
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

__declspec(naked)
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
    psllw xmm4, 12
    movdqa xmm3, xmm4 // generate mask 0x00f000f0
    psrlw xmm3, 8

  convertloop:
    movdqu xmm0, [eax] // fetch 4 pixels of argb
    movdqa xmm1, xmm0
    pand xmm0, xmm3 // low nibble
    pand xmm1, xmm4 // high nibble
    psrld xmm0, 4
    psrld xmm1, 8
    por xmm0, xmm1
    packuswb xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565ROW_AVX2
__declspec(naked)
void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // pix
    vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11 // generate mask 0x0000f800

  convertloop:
    vmovdqu ymm0, [eax] // fetch 8 pixels of argb
    vpsrld ymm2, ymm0, 5 // G
    vpsrld ymm1, ymm0, 3 // B
    vpsrld ymm0, ymm0, 8 // R
    vpand ymm2, ymm2, ymm4 // G
    vpand ymm1, ymm1, ymm3 // B
    vpand ymm0, ymm0, ymm5 // R
    vpor ymm1, ymm1, ymm2 // BG
    vpor ymm0, ymm0, ymm1 // BGR
    vpackusdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0 // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565ROW_AVX2

#ifdef HAS_ARGBTOARGB1555ROW_AVX2
__declspec(naked)
void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // pix
    vpcmpeqb ymm4, ymm4, ymm4
    vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
    vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
    vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
    vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
    vpslld ymm7, ymm7, 15

  convertloop:
    vmovdqu ymm0, [eax] // fetch 8 pixels of argb
    vpsrld ymm3, ymm0, 9 // R
    vpsrld ymm2, ymm0, 6 // G
    vpsrld ymm1, ymm0, 3 // B
    vpsrad ymm0, ymm0, 16 // A
    vpand ymm3, ymm3, ymm6 // R
    vpand ymm2, ymm2, ymm5 // G
    vpand ymm1, ymm1, ymm4 // B
    vpand ymm0, ymm0, ymm7 // A
    vpor ymm0, ymm0, ymm1 // BA
    vpor ymm2, ymm2, ymm3 // GR
    vpor ymm0, ymm0, ymm2 // BGRA
    vpackssdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB1555ROW_AVX2

#ifdef HAS_ARGBTOARGB4444ROW_AVX2
__declspec(naked)
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // pix
    vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
    vpsllw ymm4, ymm4, 12
    vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0

  convertloop:
    vmovdqu ymm0, [eax] // fetch 8 pixels of argb
    vpand ymm1, ymm0, ymm4 // high nibble
    vpand ymm0, ymm0, ymm3 // low nibble
    vpsrld ymm1, ymm1, 8
    vpsrld ymm0, ymm0, 4
    vpor ymm0, ymm0, ymm1
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB4444ROW_AVX2

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm4, kARGBToY
    movdqa xmm5, kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but with different coefficients; no +16 bias, and it
// rounds before the shift.
__declspec(naked)
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm4, kARGBToYJ
    movdqa xmm5, kAddYJ64

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    paddw xmm0, xmm5 // Add .5 for rounding.
    paddw xmm2, xmm5
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked)
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    vbroadcastf128 ymm4, kARGBToY
    vbroadcastf128 ymm5, kAddY16
    vmovdqu ymm6, kPermdARGBToY_AVX

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1 // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2 // mutates.
    vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
    vpaddb ymm0, ymm0, ymm5 // add 16 for Y
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
__declspec(naked)
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    vbroadcastf128 ymm4, kARGBToYJ
    vbroadcastf128 ymm5, kAddYJ64
    vmovdqu ymm6, kPermdARGBToY_AVX

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1 // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
    vpaddw ymm2, ymm2, ymm5
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2 // mutates.
    vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYJROW_AVX2

__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm4, kBGRAToY
    movdqa xmm5, kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm4, kABGRToY
    movdqa xmm5, kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked)
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm4, kRGBAToY
    movdqa xmm5, kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // pix
    movdqa xmm5, kAddUV128
    movdqa xmm6, kARGBToV
    movdqa xmm7, kARGBToU
    sub edi, edx // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5 // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

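// A scalar sketch of step 2 above (hypothetical helpers, for illustration):
// given a 2x2 box-averaged pixel, the signed byte coefficients are applied,
// the sum is shifted down arithmetically by 8 (psraw) and re-biased to
// unsigned with +128 (kAddUV128).
static __inline uint8 ARGBPixelToU(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static __inline uint8 ARGBPixelToV(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}
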
__declspec(naked)
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // pix
    movdqa xmm5, kAddUVJ128
    movdqa xmm6, kARGBToVJ
    movdqa xmm7, kARGBToUJ
    sub edi, edx // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    paddw xmm0, xmm5 // +.5 rounding -> unsigned
    paddw xmm1, xmm5
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked)
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // pix
    vbroadcastf128 ymm5, kAddUV128
    vbroadcastf128 ymm6, kARGBToV
    vbroadcastf128 ymm7, kARGBToU
    sub edi, edx // stride from u to v

  convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpavgb ymm0, ymm0, [eax + esi]
    vpavgb ymm1, ymm1, [eax + esi + 32]
    vpavgb ymm2, ymm2, [eax + esi + 64]
    vpavgb ymm3, ymm3, [eax + esi + 96]
    lea eax, [eax + 128]
    vshufps ymm4, ymm0, ymm1, 0x88
    vshufps ymm0, ymm0, ymm1, 0xdd
    vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
    vshufps ymm4, ymm2, ymm3, 0x88
    vshufps ymm2, ymm2, ymm3, 0xdd
    vpavgb ymm2, ymm2, ymm4 // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7 // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6 // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw ymm1, ymm1, ymm3 // mutates
    vphaddw ymm0, ymm0, ymm2
    vpsraw ymm1, ymm1, 8
    vpsraw ymm0, ymm0, 8
    vpacksswb ymm0, ymm1, ymm0 // mutates
    vpermq ymm0, ymm0, 0xd8 // For vpacksswb
    vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw
    vpaddb ymm0, ymm0, ymm5 // -> unsigned

    // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0 // U
    vextractf128 [edx + edi], ymm0, 1 // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVROW_AVX2

__declspec(naked)
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4] // src_argb
    mov edx, [esp + 4 + 8] // dst_u
    mov edi, [esp + 4 + 12] // dst_v
    mov ecx, [esp + 4 + 16] // pix
    movdqa xmm5, kAddUV128
    movdqa xmm6, kARGBToV
    movdqa xmm7, kARGBToU
    sub edi, edx // stride from u to v

  convertloop:
    /* convert to U and V */
    movdqu xmm0, [eax] // U
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0

    movdqu xmm0, [eax] // V
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm6
    pmaddubsw xmm1, xmm6
    pmaddubsw xmm2, xmm6
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5
    lea eax, [eax + 64]
    movdqu [edx + edi], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

__declspec(naked)
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4] // src_argb
    mov edx, [esp + 4 + 8] // dst_u
    mov edi, [esp + 4 + 12] // dst_v
    mov ecx, [esp + 4 + 16] // pix
    movdqa xmm5, kAddUV128
    movdqa xmm6, kARGBToV
    movdqa xmm7, kARGBToU
    sub edi, edx // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5 // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // pix
    movdqa xmm5, kAddUV128
    movdqa xmm6, kBGRAToV
    movdqa xmm7, kBGRAToU
    sub edi, edx // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5 // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // pix
    movdqa xmm5, kAddUV128
    movdqa xmm6, kABGRToV
    movdqa xmm7, kABGRToU
    sub edi, edx // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5 // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
1853
1854 __declspec(naked)
1855 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1856 uint8* dst_u, uint8* dst_v, int width) {
1857 __asm {
1858 push esi
1859 push edi
1860 mov eax, [esp + 8 + 4] // src_argb
1861 mov esi, [esp + 8 + 8] // src_stride_argb
1862 mov edx, [esp + 8 + 12] // dst_u
1863 mov edi, [esp + 8 + 16] // dst_v
1864 mov ecx, [esp + 8 + 20] // width
1865 movdqa xmm5, kAddUV128
1866 movdqa xmm6, kRGBAToV
1867 movdqa xmm7, kRGBAToU
1868 sub edi, edx // stride from u to v
1869
1870 convertloop:
1871 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1872 movdqu xmm0, [eax]
1873 movdqu xmm4, [eax + esi]
1874 pavgb xmm0, xmm4
1875 movdqu xmm1, [eax + 16]
1876 movdqu xmm4, [eax + esi + 16]
1877 pavgb xmm1, xmm4
1878 movdqu xmm2, [eax + 32]
1879 movdqu xmm4, [eax + esi + 32]
1880 pavgb xmm2, xmm4
1881 movdqu xmm3, [eax + 48]
1882 movdqu xmm4, [eax + esi + 48]
1883 pavgb xmm3, xmm4
1884
1885 lea eax, [eax + 64]
1886 movdqa xmm4, xmm0
1887 shufps xmm0, xmm1, 0x88
1888 shufps xmm4, xmm1, 0xdd
1889 pavgb xmm0, xmm4
1890 movdqa xmm4, xmm2
1891 shufps xmm2, xmm3, 0x88
1892 shufps xmm4, xmm3, 0xdd
1893 pavgb xmm2, xmm4
1894
1895 // step 2 - convert to U and V
1896 // from here down is very similar to Y code except
1897 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1898 movdqa xmm1, xmm0
1899 movdqa xmm3, xmm2
1900 pmaddubsw xmm0, xmm7 // U
1901 pmaddubsw xmm2, xmm7
1902 pmaddubsw xmm1, xmm6 // V
1903 pmaddubsw xmm3, xmm6
1904 phaddw xmm0, xmm2
1905 phaddw xmm1, xmm3
1906 psraw xmm0, 8
1907 psraw xmm1, 8
1908 packsswb xmm0, xmm1
1909 paddb xmm0, xmm5 // -> unsigned
1910
1911 // step 3 - store 8 U and 8 V values
1912 movlps qword ptr [edx], xmm0 // U
1913 movhps qword ptr [edx + edi], xmm0 // V
1914 lea edx, [edx + 8]
1915 sub ecx, 16
1916 jg convertloop
1917
1918 pop edi
1919 pop esi
1920 ret
1921 }
1922 }
1923 #endif // HAS_ARGBTOYROW_SSSE3
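
// Scalar reference for the BGRA/ABGR/RGBA UV rows above: each 2x2 block is
// box-averaged, then U and V are computed from the averaged R, G, B. A minimal
// sketch only; the coefficients are the BT.601 values libyuv uses in its
// scalar path (an assumption here - the SIMD code reads equivalent values from
// the kBGRAToU/kBGRAToV-style tables, permuted for each byte order), and the
// helper names are hypothetical.
static __inline uint8 ScalarRGBToU(int r, int g, int b) {
  // 0x8080 = 128 + 128*256: low byte rounds, high byte biases to unsigned.
  return (uint8)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}
static __inline uint8 ScalarRGBToV(int r, int g, int b) {
  return (uint8)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}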
1924
1925 // Read 16 UV from 444
1926 #define READYUV444_AVX2 __asm { \
1927 __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \
1928 __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \
1929 __asm lea esi, [esi + 16] \
1930 __asm vpermq ymm0, ymm0, 0xd8 \
1931 __asm vpermq ymm1, ymm1, 0xd8 \
1932 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1933 }
1934
1935 // Read 8 UV from 422, upsample to 16 UV.
1936 #define READYUV422_AVX2 __asm { \
1937 __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
1938 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
1939 __asm lea esi, [esi + 8] \
1940 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1941 __asm vpermq ymm0, ymm0, 0xd8 \
1942 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1943 }
1944
1945 // Read 4 UV from 411, upsample to 16 UV.
1946 #define READYUV411_AVX2 __asm { \
1947 __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \
1948 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \
1949 __asm lea esi, [esi + 4] \
1950 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1951 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1952 __asm vpermq ymm0, ymm0, 0xd8 \
1953 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
1954 }
1955
1956 // Read 8 UV from NV12, upsample to 16 UV.
1957 #define READNV12_AVX2 __asm { \
1958 __asm vmovdqu xmm0, [esi] /* UV */ \
1959 __asm lea esi, [esi + 16] \
1960 __asm vpermq ymm0, ymm0, 0xd8 \
1961 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1962 }
1963
1964 // Convert 16 pixels: 16 UV and 16 Y.
1965 #define YUVTORGB_AVX2(YuvConstants) __asm { \
1966 /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \
1967 __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \
1968 __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \
1969 __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \
1970 __asm vmovdqu ymm3, YuvConstants.kUVBiasR \
1971 __asm vpsubw ymm2, ymm3, ymm2 \
1972 __asm vmovdqu ymm3, YuvConstants.kUVBiasG \
1973 __asm vpsubw ymm1, ymm3, ymm1 \
1974 __asm vmovdqu ymm3, YuvConstants.kUVBiasB \
1975 __asm vpsubw ymm0, ymm3, ymm0 \
1976 /* Step 2: Find Y contribution to 16 R,G,B values */ \
1977 __asm vmovdqu xmm3, [eax] /* NOLINT */ \
1978 __asm lea eax, [eax + 16] \
1979 __asm vpermq ymm3, ymm3, 0xd8 \
1980 __asm vpunpcklbw ymm3, ymm3, ymm3 \
1981 __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \
1982 __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \
1983 __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \
1984 __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \
1985 __asm vpsraw ymm0, ymm0, 6 \
1986 __asm vpsraw ymm1, ymm1, 6 \
1987 __asm vpsraw ymm2, ymm2, 6 \
1988 __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
1989 __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
1990 __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
1991 }
1992
1993 // Store 16 ARGB values.
1994 #define STOREARGB_AVX2 __asm { \
1995 /* Step 3: Weave into ARGB */ \
1996 __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
1997 __asm vpermq ymm0, ymm0, 0xd8 \
1998 __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
1999 __asm vpermq ymm2, ymm2, 0xd8 \
2000 __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
2001 __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
2002 __asm vmovdqu 0[edx], ymm1 \
2003 __asm vmovdqu 32[edx], ymm0 \
2004 __asm lea edx, [edx + 64] \
2005 }
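
// Per-pixel math performed by YUVTORGB_AVX2, written out as a scalar sketch
// using the BT.601 constants defined (and since #undef'd) near the top of this
// file; Clamp() stands for the [0, 255] saturation that vpackuswb provides:
//   y1 = ((uint32)(y * 0x0101) * YG) >> 16;          // vpunpcklbw + vpmulhuw
//   b  = Clamp((BB - u * UB + y1) >> 6);             // bias - UV term + Y term
//   g  = Clamp((BG - (u * UG + v * VG) + y1) >> 6);
//   r  = Clamp((BR - v * VR + y1) >> 6);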
2006
2007 #ifdef HAS_I422TOARGBROW_AVX2
2008 // 16 pixels
2009 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2010 __declspec(naked)
2011 void I422ToARGBRow_AVX2(const uint8* y_buf,
2012 const uint8* u_buf,
2013 const uint8* v_buf,
2014 uint8* dst_argb,
2015 int width) {
2016 __asm {
2017 push esi
2018 push edi
2019 mov eax, [esp + 8 + 4] // Y
2020 mov esi, [esp + 8 + 8] // U
2021 mov edi, [esp + 8 + 12] // V
2022 mov edx, [esp + 8 + 16] // argb
2023 mov ecx, [esp + 8 + 20] // width
2024 sub edi, esi
2025 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2026
2027 convertloop:
2028 READYUV422_AVX2
2029 YUVTORGB_AVX2(kYuvConstants)
2030 STOREARGB_AVX2
2031
2032 sub ecx, 16
2033 jg convertloop
2034
2035 pop edi
2036 pop esi
2037 vzeroupper
2038 ret
2039 }
2040 }
2041 #endif // HAS_I422TOARGBROW_AVX2
2042
2043 #ifdef HAS_J422TOARGBROW_AVX2
2044 // 16 pixels
2045 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2046 __declspec(naked)
2047 void J422ToARGBRow_AVX2(const uint8* y_buf,
2048 const uint8* u_buf,
2049 const uint8* v_buf,
2050 uint8* dst_argb,
2051 int width) {
2052 __asm {
2053 push esi
2054 push edi
2055 mov eax, [esp + 8 + 4] // Y
2056 mov esi, [esp + 8 + 8] // U
2057 mov edi, [esp + 8 + 12] // V
2058 mov edx, [esp + 8 + 16] // argb
2059 mov ecx, [esp + 8 + 20] // width
2060 sub edi, esi
2061 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2062
2063 convertloop:
2064 READYUV422_AVX2
2065 YUVTORGB_AVX2(kYuvJConstants)
2066 STOREARGB_AVX2
2067
2068 sub ecx, 16
2069 jg convertloop
2070
2071 pop edi
2072 pop esi
2073 vzeroupper
2074 ret
2075 }
2076 }
2077 #endif // HAS_J422TOARGBROW_AVX2
2078
2079 #ifdef HAS_I444TOARGBROW_AVX2
2080 // 16 pixels
2081 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2082 __declspec(naked)
2083 void I444ToARGBRow_AVX2(const uint8* y_buf,
2084 const uint8* u_buf,
2085 const uint8* v_buf,
2086 uint8* dst_argb,
2087 int width) {
2088 __asm {
2089 push esi
2090 push edi
2091 mov eax, [esp + 8 + 4] // Y
2092 mov esi, [esp + 8 + 8] // U
2093 mov edi, [esp + 8 + 12] // V
2094 mov edx, [esp + 8 + 16] // argb
2095 mov ecx, [esp + 8 + 20] // width
2096 sub edi, esi
2097 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2098
2099 convertloop:
2100 READYUV444_AVX2
2101 YUVTORGB_AVX2(kYuvConstants)
2102 STOREARGB_AVX2
2103
2104 sub ecx, 16
2105 jg convertloop
2106
2107 pop edi
2108 pop esi
2109 vzeroupper
2110 ret
2111 }
2112 }
2113 #endif // HAS_I444TOARGBROW_AVX2
2114
2115 #ifdef HAS_I411TOARGBROW_AVX2
2116 // 16 pixels
2117 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2118 __declspec(naked)
2119 void I411ToARGBRow_AVX2(const uint8* y_buf,
2120 const uint8* u_buf,
2121 const uint8* v_buf,
2122 uint8* dst_argb,
2123 int width) {
2124 __asm {
2125 push esi
2126 push edi
2127 mov eax, [esp + 8 + 4] // Y
2128 mov esi, [esp + 8 + 8] // U
2129 mov edi, [esp + 8 + 12] // V
2130 mov edx, [esp + 8 + 16] // argb
2131 mov ecx, [esp + 8 + 20] // width
2132 sub edi, esi
2133 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2134
2135 convertloop:
2136 READYUV411_AVX2
2137 YUVTORGB_AVX2(kYuvConstants)
2138 STOREARGB_AVX2
2139
2140 sub ecx, 16
2141 jg convertloop
2142
2143 pop edi
2144 pop esi
2145 vzeroupper
2146 ret
2147 }
2148 }
2149 #endif // HAS_I411TOARGBROW_AVX2
2150
2151 #ifdef HAS_NV12TOARGBROW_AVX2
2152 // 16 pixels.
2153 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2154 __declspec(naked)
2155 void NV12ToARGBRow_AVX2(const uint8* y_buf,
2156 const uint8* uv_buf,
2157 uint8* dst_argb,
2158 int width) {
2159 __asm {
2160 push esi
2161 mov eax, [esp + 4 + 4] // Y
2162 mov esi, [esp + 4 + 8] // UV
2163 mov edx, [esp + 4 + 12] // argb
2164 mov ecx, [esp + 4 + 16] // width
2165 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2166
2167 convertloop:
2168 READNV12_AVX2
2169 YUVTORGB_AVX2(kYuvConstants)
2170 STOREARGB_AVX2
2171
2172 sub ecx, 16
2173 jg convertloop
2174
2175 pop esi
2176 vzeroupper
2177 ret
2178 }
2179 }
2180 #endif // HAS_NV12TOARGBROW_AVX2
2181
2182 #ifdef HAS_NV21TOARGBROW_AVX2
2183 // 16 pixels.
2184 // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
2185 __declspec(naked)
2186 void NV21ToARGBRow_AVX2(const uint8* y_buf,
2187 const uint8* uv_buf,
2188 uint8* dst_argb,
2189 int width) {
2190 __asm {
2191 push esi
2192 mov eax, [esp + 4 + 4] // Y
2193 mov esi, [esp + 4 + 8] // UV
2194 mov edx, [esp + 4 + 12] // argb
2195 mov ecx, [esp + 4 + 16] // width
2196 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2197
2198 convertloop:
2199 READNV12_AVX2
2200 YUVTORGB_AVX2(kYvuConstants)
2201 STOREARGB_AVX2
2202
2203 sub ecx, 16
2204 jg convertloop
2205
2206 pop esi
2207 vzeroupper
2208 ret
2209 }
2210 }
2211 #endif // HAS_NV21TOARGBROW_AVX2
2212
2213 #ifdef HAS_I422TOBGRAROW_AVX2
2214 // 16 pixels
2215 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
2216 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
2217 __declspec(naked)
2218 void I422ToBGRARow_AVX2(const uint8* y_buf,
2219 const uint8* u_buf,
2220 const uint8* v_buf,
2221 uint8* dst_argb,
2222 int width) {
2223 __asm {
2224 push esi
2225 push edi
2226 mov eax, [esp + 8 + 4] // Y
2227 mov esi, [esp + 8 + 8] // U
2228 mov edi, [esp + 8 + 12] // V
2229 mov edx, [esp + 8 + 16] // argb
2230 mov ecx, [esp + 8 + 20] // width
2231 sub edi, esi
2232 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2233
2234 convertloop:
2235 READYUV422_AVX2
2236 YUVTORGB_AVX2(kYuvConstants)
2237
2238 // Step 3: Weave into BGRA
2239 vpunpcklbw ymm1, ymm1, ymm0 // GB
2240 vpermq ymm1, ymm1, 0xd8
2241 vpunpcklbw ymm2, ymm5, ymm2 // AR
2242 vpermq ymm2, ymm2, 0xd8
2243 vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels
2244 vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels
2245 vmovdqu [edx], ymm0
2246 vmovdqu [edx + 32], ymm2
2247 lea edx, [edx + 64]
2248 sub ecx, 16
2249 jg convertloop
2250
2251 pop edi
2252 pop esi
2253 vzeroupper
2254 ret
2255 }
2256 }
2257 #endif // HAS_I422TOBGRAROW_AVX2
2258
2259 #ifdef HAS_I422TORGBAROW_AVX2
2260 // 16 pixels
2261 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2262 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
2263 __declspec(naked)
2264 void I422ToRGBARow_AVX2(const uint8* y_buf,
2265 const uint8* u_buf,
2266 const uint8* v_buf,
2267 uint8* dst_argb,
2268 int width) {
2269 __asm {
2270 push esi
2271 push edi
2272 mov eax, [esp + 8 + 4] // Y
2273 mov esi, [esp + 8 + 8] // U
2274 mov edi, [esp + 8 + 12] // V
2275 mov edx, [esp + 8 + 16] // argb
2276 mov ecx, [esp + 8 + 20] // width
2277 sub edi, esi
2278 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2279
2280 convertloop:
2281 READYUV422_AVX2
2282 YUVTORGB_AVX2(kYuvConstants)
2283
2284 // Step 3: Weave into RGBA
2285 vpunpcklbw ymm1, ymm1, ymm2 // GR
2286 vpermq ymm1, ymm1, 0xd8
2287 vpunpcklbw ymm2, ymm5, ymm0 // AB
2288 vpermq ymm2, ymm2, 0xd8
2289 vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels
2290 vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels
2291 vmovdqu [edx], ymm0
2292 vmovdqu [edx + 32], ymm1
2293 lea edx, [edx + 64]
2294 sub ecx, 16
2295 jg convertloop
2296
2297 pop edi
2298 pop esi
2299 vzeroupper
2300 ret
2301 }
2302 }
2303 #endif // HAS_I422TORGBAROW_AVX2
2304
2305 #ifdef HAS_I422TOABGRROW_AVX2
2306 // 16 pixels
2307 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
2308 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
2309 __declspec(naked)
2310 void I422ToABGRRow_AVX2(const uint8* y_buf,
2311 const uint8* u_buf,
2312 const uint8* v_buf,
2313 uint8* dst_argb,
2314 int width) {
2315 __asm {
2316 push esi
2317 push edi
2318 mov eax, [esp + 8 + 4] // Y
2319 mov esi, [esp + 8 + 8] // U
2320 mov edi, [esp + 8 + 12] // V
2321 mov edx, [esp + 8 + 16] // argb
2322 mov ecx, [esp + 8 + 20] // width
2323 sub edi, esi
2324 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2325
2326 convertloop:
2327 READYUV422_AVX2
2328 YUVTORGB_AVX2(kYuvConstants)
2329
2330 // Step 3: Weave into ABGR
2331 vpunpcklbw ymm1, ymm2, ymm1 // RG
2332 vpermq ymm1, ymm1, 0xd8
2333 vpunpcklbw ymm2, ymm0, ymm5 // BA
2334 vpermq ymm2, ymm2, 0xd8
2335 vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels
2336 vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels
2337 vmovdqu [edx], ymm0
2338 vmovdqu [edx + 32], ymm1
2339 lea edx, [edx + 64]
2340 sub ecx, 16
2341 jg convertloop
2342
2343 pop edi
2344 pop esi
2345 vzeroupper
2346 ret
2347 }
2348 }
2349 #endif // HAS_I422TOABGRROW_AVX2
2350
2351 #if defined(HAS_I422TOARGBROW_SSSE3)
2352 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2353
2354 // Read 8 UV from 444.
2355 #define READYUV444 __asm { \
2356 __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
2357 __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
2358 __asm lea esi, [esi + 8] \
2359 __asm punpcklbw xmm0, xmm1 /* UV */ \
2360 }
2361
2362 // Read 4 UV from 422, upsample to 8 UV.
2363 #define READYUV422 __asm { \
2364 __asm movd xmm0, [esi] /* U */ \
2365 __asm movd xmm1, [esi + edi] /* V */ \
2366 __asm lea esi, [esi + 4] \
2367 __asm punpcklbw xmm0, xmm1 /* UV */ \
2368 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2369 }
2370
2371 // Read 2 UV from 411, upsample to 8 UV.
2372 #define READYUV411 __asm { \
2373 __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \
2374 __asm movd xmm0, ebx \
2375 __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \
2376 __asm movd xmm1, ebx \
2377 __asm lea esi, [esi + 2] \
2378 __asm punpcklbw xmm0, xmm1 /* UV */ \
2379 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2380 __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \
2381 }
2382
2383 // Read 4 UV from NV12, upsample to 8 UV.
2384 #define READNV12 __asm { \
2385 __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
2386 __asm lea esi, [esi + 8] \
2387 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2388 }
2389
2390 // Convert 8 pixels: 8 UV and 8 Y.
2391 #define YUVTORGB(YuvConstants) __asm { \
2392 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
2393 __asm movdqa xmm1, xmm0 \
2394 __asm movdqa xmm2, xmm0 \
2395 __asm movdqa xmm3, xmm0 \
2396 __asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \
2397 __asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV */ \
2398 __asm psubw xmm0, xmm1 \
2399 __asm movdqa xmm1, YuvConstants.kUVBiasG \
2400 __asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \
2401 __asm psubw xmm1, xmm2 \
2402 __asm movdqa xmm2, YuvConstants.kUVBiasR \
2403 __asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \
2404 __asm psubw xmm2, xmm3 \
2405 /* Step 2: Find Y contribution to 8 R,G,B values */ \
2406 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
2407 __asm lea eax, [eax + 8] \
2408 __asm punpcklbw xmm3, xmm3 \
2409 __asm pmulhuw xmm3, YuvConstants.kYToRgb \
2410 __asm paddsw xmm0, xmm3 /* B += Y */ \
2411 __asm paddsw xmm1, xmm3 /* G += Y */ \
2412 __asm paddsw xmm2, xmm3 /* R += Y */ \
2413 __asm psraw xmm0, 6 \
2414 __asm psraw xmm1, 6 \
2415 __asm psraw xmm2, 6 \
2416 __asm packuswb xmm0, xmm0 /* B */ \
2417 __asm packuswb xmm1, xmm1 /* G */ \
2418 __asm packuswb xmm2, xmm2 /* R */ \
2419 }
2420
2421 // Store 8 ARGB values.
2422 #define STOREARGB __asm { \
2423 /* Step 3: Weave into ARGB */ \
2424 __asm punpcklbw xmm0, xmm1 /* BG */ \
2425 __asm punpcklbw xmm2, xmm5 /* RA */ \
2426 __asm movdqa xmm1, xmm0 \
2427 __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
2428 __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
2429 __asm movdqu 0[edx], xmm0 \
2430 __asm movdqu 16[edx], xmm1 \
2431 __asm lea edx, [edx + 32] \
2432 }
2433
2434 // Store 8 BGRA values.
2435 #define STOREBGRA __asm { \
2436 /* Step 3: Weave into BGRA */ \
2437 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
2438 __asm punpcklbw xmm1, xmm0 /* GB */ \
2439 __asm punpcklbw xmm5, xmm2 /* AR */ \
2440 __asm movdqa xmm0, xmm5 \
2441 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
2442 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
2443 __asm movdqu 0[edx], xmm5 \
2444 __asm movdqu 16[edx], xmm0 \
2445 __asm lea edx, [edx + 32] \
2446 }
2447
2448 // Store 8 ABGR values.
2449 #define STOREABGR __asm { \
2450 /* Step 3: Weave into ABGR */ \
2451 __asm punpcklbw xmm2, xmm1 /* RG */ \
2452 __asm punpcklbw xmm0, xmm5 /* BA */ \
2453 __asm movdqa xmm1, xmm2 \
2454 __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \
2455 __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \
2456 __asm movdqu 0[edx], xmm2 \
2457 __asm movdqu 16[edx], xmm1 \
2458 __asm lea edx, [edx + 32] \
2459 }
2460
2461 // Store 8 RGBA values.
2462 #define STORERGBA __asm { \
2463 /* Step 3: Weave into RGBA */ \
2464 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
2465 __asm punpcklbw xmm1, xmm2 /* GR */ \
2466 __asm punpcklbw xmm5, xmm0 /* AB */ \
2467 __asm movdqa xmm0, xmm5 \
2468 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
2469 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
2470 __asm movdqu 0[edx], xmm5 \
2471 __asm movdqu 16[edx], xmm0 \
2472 __asm lea edx, [edx + 32] \
2473 }
2474
2475 // Store 8 RGB24 values.
2476 #define STORERGB24 __asm { \
2477 /* Step 3: Weave into RRGB */ \
2478 __asm punpcklbw xmm0, xmm1 /* BG */ \
2479 __asm punpcklbw xmm2, xmm2 /* RR */ \
2480 __asm movdqa xmm1, xmm0 \
2481 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2482 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
2483 /* Step 4: RRGB -> RGB24 */ \
2484 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
2485 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
2486 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
2487 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
2488 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
2489 __asm lea edx, [edx + 24] \
2490 }
2491
2492 // Store 8 RAW values.
2493 #define STORERAW __asm { \
2494 /* Step 3: Weave into RRGB */ \
2495 __asm punpcklbw xmm0, xmm1 /* BG */ \
2496 __asm punpcklbw xmm2, xmm2 /* RR */ \
2497 __asm movdqa xmm1, xmm0 \
2498 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2499 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
2500 /* Step 4: RRGB -> RAW */ \
2501 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
2502 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
2503 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
2504 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
2505 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
2506 __asm lea edx, [edx + 24] \
2507 }
2508
2509 // Store 8 RGB565 values.
2510 #define STORERGB565 __asm { \
2511 /* Step 3: Weave into RRGB */ \
2512 __asm punpcklbw xmm0, xmm1 /* BG */ \
2513 __asm punpcklbw xmm2, xmm2 /* RR */ \
2514 __asm movdqa xmm1, xmm0 \
2515 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2516 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
2517 /* Step 4: RRGB -> RGB565 */ \
2518 __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
2519 __asm movdqa xmm2, xmm0 /* G */ \
2520 __asm pslld xmm0, 8 /* R */ \
2521 __asm psrld xmm3, 3 /* B */ \
2522 __asm psrld xmm2, 5 /* G */ \
2523 __asm psrad xmm0, 16 /* R */ \
2524 __asm pand xmm3, xmm5 /* B */ \
2525 __asm pand xmm2, xmm6 /* G */ \
2526 __asm pand xmm0, xmm7 /* R */ \
2527 __asm por xmm3, xmm2 /* BG */ \
2528 __asm por xmm0, xmm3 /* BGR */ \
2529 __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
2530 __asm movdqa xmm2, xmm1 /* G */ \
2531 __asm pslld xmm1, 8 /* R */ \
2532 __asm psrld xmm3, 3 /* B */ \
2533 __asm psrld xmm2, 5 /* G */ \
2534 __asm psrad xmm1, 16 /* R */ \
2535 __asm pand xmm3, xmm5 /* B */ \
2536 __asm pand xmm2, xmm6 /* G */ \
2537 __asm pand xmm1, xmm7 /* R */ \
2538 __asm por xmm3, xmm2 /* BG */ \
2539 __asm por xmm1, xmm3 /* BGR */ \
2540 __asm packssdw xmm0, xmm1 \
2541 __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
2542 __asm lea edx, [edx + 16] \
2543 }
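
// STORERGB565 above is the vector form of the scalar pack
//   uint16 rgb565 = (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
// i.e. 5 bits of blue, 6 of green and 5 of red, selected with the 0x0000001f /
// 0x000007e0 / 0xfffff800 masks generated in I422ToRGB565Row_SSSE3 below.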
2544
2545 // 8 pixels.
2546 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2547 __declspec(naked)
2548 void I444ToARGBRow_SSSE3(const uint8* y_buf,
2549 const uint8* u_buf,
2550 const uint8* v_buf,
2551 uint8* dst_argb,
2552 int width) {
2553 __asm {
2554 push esi
2555 push edi
2556 mov eax, [esp + 8 + 4] // Y
2557 mov esi, [esp + 8 + 8] // U
2558 mov edi, [esp + 8 + 12] // V
2559 mov edx, [esp + 8 + 16] // argb
2560 mov ecx, [esp + 8 + 20] // width
2561 sub edi, esi
2562 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2563
2564 convertloop:
2565 READYUV444
2566 YUVTORGB(kYuvConstants)
2567 STOREARGB
2568
2569 sub ecx, 8
2570 jg convertloop
2571
2572 pop edi
2573 pop esi
2574 ret
2575 }
2576 }
2577
2578 // 8 pixels.
2579 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2580 __declspec(naked)
2581 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
2582 const uint8* u_buf,
2583 const uint8* v_buf,
2584 uint8* dst_rgb24,
2585 int width) {
2586 __asm {
2587 push esi
2588 push edi
2589 mov eax, [esp + 8 + 4] // Y
2590 mov esi, [esp + 8 + 8] // U
2591 mov edi, [esp + 8 + 12] // V
2592 mov edx, [esp + 8 + 16] // rgb24
2593 mov ecx, [esp + 8 + 20] // width
2594 sub edi, esi
2595 movdqa xmm5, kShuffleMaskARGBToRGB24_0
2596 movdqa xmm6, kShuffleMaskARGBToRGB24
2597
2598 convertloop:
2599 READYUV422
2600 YUVTORGB(kYuvConstants)
2601 STORERGB24
2602
2603 sub ecx, 8
2604 jg convertloop
2605
2606 pop edi
2607 pop esi
2608 ret
2609 }
2610 }
2611
2612 // 8 pixels.
2613 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
2614 __declspec(naked)
2615 void I422ToRAWRow_SSSE3(const uint8* y_buf,
2616 const uint8* u_buf,
2617 const uint8* v_buf,
2618 uint8* dst_raw,
2619 int width) {
2620 __asm {
2621 push esi
2622 push edi
2623 mov eax, [esp + 8 + 4] // Y
2624 mov esi, [esp + 8 + 8] // U
2625 mov edi, [esp + 8 + 12] // V
2626 mov edx, [esp + 8 + 16] // raw
2627 mov ecx, [esp + 8 + 20] // width
2628 sub edi, esi
2629 movdqa xmm5, kShuffleMaskARGBToRAW_0
2630 movdqa xmm6, kShuffleMaskARGBToRAW
2631
2632 convertloop:
2633 READYUV422
2634 YUVTORGB(kYuvConstants)
2635 STORERAW
2636
2637 sub ecx, 8
2638 jg convertloop
2639
2640 pop edi
2641 pop esi
2642 ret
2643 }
2644 }
2645
2646 // 8 pixels
2647 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2648 __declspec(naked)
2649 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
2650 const uint8* u_buf,
2651 const uint8* v_buf,
2652 uint8* rgb565_buf,
2653 int width) {
2654 __asm {
2655 push esi
2656 push edi
2657 mov eax, [esp + 8 + 4] // Y
2658 mov esi, [esp + 8 + 8] // U
2659 mov edi, [esp + 8 + 12] // V
2660 mov edx, [esp + 8 + 16] // rgb565
2661 mov ecx, [esp + 8 + 20] // width
2662 sub edi, esi
2663 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
2664 psrld xmm5, 27
2665 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
2666 psrld xmm6, 26
2667 pslld xmm6, 5
2668 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
2669 pslld xmm7, 11
2670
2671 convertloop:
2672 READYUV422
2673 YUVTORGB(kYuvConstants)
2674 STORERGB565
2675
2676 sub ecx, 8
2677 jg convertloop
2678
2679 pop edi
2680 pop esi
2681 ret
2682 }
2683 }
2684
2685 // 8 pixels.
2686 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2687 __declspec(naked)
2688 void I422ToARGBRow_SSSE3(const uint8* y_buf,
2689 const uint8* u_buf,
2690 const uint8* v_buf,
2691 uint8* dst_argb,
2692 int width) {
2693 __asm {
2694 push esi
2695 push edi
2696 mov eax, [esp + 8 + 4] // Y
2697 mov esi, [esp + 8 + 8] // U
2698 mov edi, [esp + 8 + 12] // V
2699 mov edx, [esp + 8 + 16] // argb
2700 mov ecx, [esp + 8 + 20] // width
2701 sub edi, esi
2702 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2703
2704 convertloop:
2705 READYUV422
2706 YUVTORGB(kYuvConstants)
2707 STOREARGB
2708
2709 sub ecx, 8
2710 jg convertloop
2711
2712 pop edi
2713 pop esi
2714 ret
2715 }
2716 }
2717
2718 // 8 pixels.
2719 // JPEG color space version of I422ToARGB
2720 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2721 __declspec(naked)
2722 void J422ToARGBRow_SSSE3(const uint8* y_buf,
2723 const uint8* u_buf,
2724 const uint8* v_buf,
2725 uint8* dst_argb,
2726 int width) {
2727 __asm {
2728 push esi
2729 push edi
2730 mov eax, [esp + 8 + 4] // Y
2731 mov esi, [esp + 8 + 8] // U
2732 mov edi, [esp + 8 + 12] // V
2733 mov edx, [esp + 8 + 16] // argb
2734 mov ecx, [esp + 8 + 20] // width
2735 sub edi, esi
2736 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2737
2738 convertloop:
2739 READYUV422
2740 YUVTORGB(kYuvJConstants)
2741 STOREARGB
2742
2743 sub ecx, 8
2744 jg convertloop
2745
2746 pop edi
2747 pop esi
2748 ret
2749 }
2750 }
2751
2752 // 8 pixels.
2753 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2754 // Similar to I420 but duplicates UV once more.
2755 __declspec(naked)
2756 void I411ToARGBRow_SSSE3(const uint8* y_buf,
2757 const uint8* u_buf,
2758 const uint8* v_buf,
2759 uint8* dst_argb,
2760 int width) {
2761 __asm {
2762 push ebx
2763 push esi
2764 push edi
2765 mov eax, [esp + 12 + 4] // Y
2766 mov esi, [esp + 12 + 8] // U
2767 mov edi, [esp + 12 + 12] // V
2768 mov edx, [esp + 12 + 16] // argb
2769 mov ecx, [esp + 12 + 20] // width
2770 sub edi, esi
2771 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2772
2773 convertloop:
2774 READYUV411 // modifies EBX
2775 YUVTORGB(kYuvConstants)
2776 STOREARGB
2777
2778 sub ecx, 8
2779 jg convertloop
2780
2781 pop edi
2782 pop esi
2783 pop ebx
2784 ret
2785 }
2786 }
2787
2788 // 8 pixels.
2789 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2790 __declspec(naked)
2791 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
2792 const uint8* uv_buf,
2793 uint8* dst_argb,
2794 int width) {
2795 __asm {
2796 push esi
2797 mov eax, [esp + 4 + 4] // Y
2798 mov esi, [esp + 4 + 8] // UV
2799 mov edx, [esp + 4 + 12] // argb
2800 mov ecx, [esp + 4 + 16] // width
2801 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2802
2803 convertloop:
2804 READNV12
2805 YUVTORGB(kYuvConstants)
2806 STOREARGB
2807
2808 sub ecx, 8
2809 jg convertloop
2810
2811 pop esi
2812 ret
2813 }
2814 }
2815
2816 // 8 pixels.
2817 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
2818 __declspec(naked)
2819 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
2820 const uint8* uv_buf,
2821 uint8* dst_argb,
2822 int width) {
2823 __asm {
2824 push esi
2825 mov eax, [esp + 4 + 4] // Y
2826 mov esi, [esp + 4 + 8] // UV
2827 mov edx, [esp + 4 + 12] // argb
2828 mov ecx, [esp + 4 + 16] // width
2829 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2830
2831 convertloop:
2832 READNV12
2833 YUVTORGB(kYvuConstants)
2834 STOREARGB
2835
2836 sub ecx, 8
2837 jg convertloop
2838
2839 pop esi
2840 ret
2841 }
2842 }
2843
2844 __declspec(naked)
2845 void I422ToBGRARow_SSSE3(const uint8* y_buf,
2846 const uint8* u_buf,
2847 const uint8* v_buf,
2848 uint8* dst_bgra,
2849 int width) {
2850 __asm {
2851 push esi
2852 push edi
2853 mov eax, [esp + 8 + 4] // Y
2854 mov esi, [esp + 8 + 8] // U
2855 mov edi, [esp + 8 + 12] // V
2856 mov edx, [esp + 8 + 16] // bgra
2857 mov ecx, [esp + 8 + 20] // width
2858 sub edi, esi
2859
2860 convertloop:
2861 READYUV422
2862 YUVTORGB(kYuvConstants)
2863 STOREBGRA
2864
2865 sub ecx, 8
2866 jg convertloop
2867
2868 pop edi
2869 pop esi
2870 ret
2871 }
2872 }
2873
2874 __declspec(naked)
2875 void I422ToABGRRow_SSSE3(const uint8* y_buf,
2876 const uint8* u_buf,
2877 const uint8* v_buf,
2878 uint8* dst_abgr,
2879 int width) {
2880 __asm {
2881 push esi
2882 push edi
2883 mov eax, [esp + 8 + 4] // Y
2884 mov esi, [esp + 8 + 8] // U
2885 mov edi, [esp + 8 + 12] // V
2886 mov edx, [esp + 8 + 16] // abgr
2887 mov ecx, [esp + 8 + 20] // width
2888 sub edi, esi
2889 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2890
2891 convertloop:
2892 READYUV422
2893 YUVTORGB(kYuvConstants)
2894 STOREABGR
2895
2896 sub ecx, 8
2897 jg convertloop
2898
2899 pop edi
2900 pop esi
2901 ret
2902 }
2903 }
2904
2905 __declspec(naked)
2906 void I422ToRGBARow_SSSE3(const uint8* y_buf,
2907 const uint8* u_buf,
2908 const uint8* v_buf,
2909 uint8* dst_rgba,
2910 int width) {
2911 __asm {
2912 push esi
2913 push edi
2914 mov eax, [esp + 8 + 4] // Y
2915 mov esi, [esp + 8 + 8] // U
2916 mov edi, [esp + 8 + 12] // V
2917 mov edx, [esp + 8 + 16] // rgba
2918 mov ecx, [esp + 8 + 20] // width
2919 sub edi, esi
2920
2921 convertloop:
2922 READYUV422
2923 YUVTORGB(kYuvConstants)
2924 STORERGBA
2925
2926 sub ecx, 8
2927 jg convertloop
2928
2929 pop edi
2930 pop esi
2931 ret
2932 }
2933 }
2934
2935 #endif // HAS_I422TOARGBROW_SSSE3
2936
2937 #ifdef HAS_I400TOARGBROW_SSE2
2938 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
2939 __declspec(naked)
2940 void I400ToARGBRow_SSE2(const uint8* y_buf,
2941 uint8* rgb_buf,
2942 int width) {
2943 __asm {
2944 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
2945 movd xmm2, eax
2946 pshufd xmm2, xmm2,0
2947 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16 - 32)
2948 movd xmm3, eax
2949 pshufd xmm3, xmm3, 0
2950 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
2951 pslld xmm4, 24
2952
2953 mov eax, [esp + 4] // Y
2954 mov edx, [esp + 8] // rgb
2955 mov ecx, [esp + 12] // width
2956
2957 convertloop:
2958 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2959 movq xmm0, qword ptr [eax]
2960 lea eax, [eax + 8]
2961 punpcklbw xmm0, xmm0 // Y.Y
2962 pmulhuw xmm0, xmm2
2963 psubusw xmm0, xmm3
2964 psrlw xmm0, 6
2965 packuswb xmm0, xmm0 // G
2966
2967 // Step 2: Weave into ARGB
2968 punpcklbw xmm0, xmm0 // GG
2969 movdqa xmm1, xmm0
2970 punpcklwd xmm0, xmm0 // BGRA first 4 pixels
2971 punpckhwd xmm1, xmm1 // BGRA next 4 pixels
2972 por xmm0, xmm4
2973 por xmm1, xmm4
2974 movdqu [edx], xmm0
2975 movdqu [edx + 16], xmm1
2976 lea edx, [edx + 32]
2977 sub ecx, 8
2978 jg convertloop
2979 ret
2980 }
2981 }
2982 #endif // HAS_I400TOARGBROW_SSE2
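
// Scalar sketch of the grey expansion above, for reference: luma is scaled by
// 1.164 in 6.6 fixed point and replicated into B, G and R with opaque alpha.
//   int g = (int)(((uint32)(y * 0x0101) * 0x4a35) >> 16) - 1160;  // pmulhuw; psubusw clamps at 0
//   if (g < 0) g = 0;
//   g >>= 6;
//   if (g > 255) g = 255;                                         // packuswb clamp
//   uint32 argb = 0xff000000u | ((uint32)g * 0x00010101u);        // replicate into B, G, R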
2983
2984 #ifdef HAS_I400TOARGBROW_AVX2
2985 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2986 // note: vpunpcklbw mutates and vpackuswb unmutates.
2987 __declspec(naked)
2988 void I400ToARGBRow_AVX2(const uint8* y_buf,
2989 uint8* rgb_buf,
2990 int width) {
2991 __asm {
2992 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
2993 vmovd xmm2, eax
2994 vbroadcastss ymm2, xmm2
2995 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16 - 32)
2996 vmovd xmm3, eax
2997 vbroadcastss ymm3, xmm3
2998 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
2999 vpslld ymm4, ymm4, 24
3000
3001 mov eax, [esp + 4] // Y
3002 mov edx, [esp + 8] // rgb
3003 mov ecx, [esp + 12] // width
3004
3005 convertloop:
3006 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
3007 vmovdqu xmm0, [eax]
3008 lea eax, [eax + 16]
3009 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
3010 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
3011 vpmulhuw ymm0, ymm0, ymm2
3012 vpsubusw ymm0, ymm0, ymm3
3013 vpsrlw ymm0, ymm0, 6
3014 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
3015
3016 // TODO(fbarchard): Weave alpha with unpack.
3017 // Step 2: Weave into ARGB
3018 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
3019 vpermq ymm1, ymm1, 0xd8
3020 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
3021 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
3022 vpor ymm0, ymm0, ymm4
3023 vpor ymm1, ymm1, ymm4
3024 vmovdqu [edx], ymm0
3025 vmovdqu [edx + 32], ymm1
3026 lea edx, [edx + 64]
3027 sub ecx, 16
3028 jg convertloop
3029 vzeroupper
3030 ret
3031 }
3032 }
3033 #endif // HAS_I400TOARGBROW_AVX2
3034
3035 #ifdef HAS_MIRRORROW_SSSE3
3036 // Shuffle table for reversing the bytes.
3037 static const uvec8 kShuffleMirror = {
3038 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3039 };
3040
3041 // TODO(fbarchard): Replace lea with -16 offset.
3042 __declspec(naked)
3043 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3044 __asm {
3045 mov eax, [esp + 4] // src
3046 mov edx, [esp + 8] // dst
3047 mov ecx, [esp + 12] // width
3048 movdqa xmm5, kShuffleMirror
3049
3050 convertloop:
3051 movdqu xmm0, [eax - 16 + ecx]
3052 pshufb xmm0, xmm5
3053 movdqu [edx], xmm0
3054 lea edx, [edx + 16]
3055 sub ecx, 16
3056 jg convertloop
3057 ret
3058 }
3059 }
3060 #endif // HAS_MIRRORROW_SSSE3
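
// All mirror rows compute the scalar loop
//   for (i = 0; i < width; ++i) dst[i] = src[width - 1 - i];
// at different granularities: single bytes here (reversed with pshufb), UV
// byte pairs in MirrorUVRow, and whole 32-bit pixels in ARGBMirrorRow below.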
3061
3062 #ifdef HAS_MIRRORROW_AVX2
3063 __declspec(naked)
3064 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3065 __asm {
3066 mov eax, [esp + 4] // src
3067 mov edx, [esp + 8] // dst
3068 mov ecx, [esp + 12] // width
3069 vbroadcastf128 ymm5, kShuffleMirror
3070
3071 convertloop:
3072 vmovdqu ymm0, [eax - 32 + ecx]
3073 vpshufb ymm0, ymm0, ymm5
3074 vpermq ymm0, ymm0, 0x4e // swap high and low halves
3075 vmovdqu [edx], ymm0
3076 lea edx, [edx + 32]
3077 sub ecx, 32
3078 jg convertloop
3079 vzeroupper
3080 ret
3081 }
3082 }
3083 #endif // HAS_MIRRORROW_AVX2
3084
3085 #ifdef HAS_MIRRORROW_SSE2
3086 __declspec(naked)
3087 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3088 __asm {
3089 mov eax, [esp + 4] // src
3090 mov edx, [esp + 8] // dst
3091 mov ecx, [esp + 12] // width
3092
3093 convertloop:
3094 movdqu xmm0, [eax - 16 + ecx]
3095 movdqa xmm1, xmm0 // swap bytes
3096 psllw xmm0, 8
3097 psrlw xmm1, 8
3098 por xmm0, xmm1
3099 pshuflw xmm0, xmm0, 0x1b // swap words
3100 pshufhw xmm0, xmm0, 0x1b
3101 pshufd xmm0, xmm0, 0x4e // swap qwords
3102 movdqu [edx], xmm0
3103 lea edx, [edx + 16]
3104 sub ecx, 16
3105 jg convertloop
3106 ret
3107 }
3108 }
3109 #endif // HAS_MIRRORROW_SSE2
3110
3111 #ifdef HAS_MIRRORROW_UV_SSSE3
3112 // Shuffle table for reversing the bytes of UV channels.
3113 static const uvec8 kShuffleMirrorUV = {
3114 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
3115 };
3116
3117 __declspec(naked)
3118 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
3119 int width) {
3120 __asm {
3121 push edi
3122 mov eax, [esp + 4 + 4] // src
3123 mov edx, [esp + 4 + 8] // dst_u
3124 mov edi, [esp + 4 + 12] // dst_v
3125 mov ecx, [esp + 4 + 16] // width
3126 movdqa xmm1, kShuffleMirrorUV
3127 lea eax, [eax + ecx * 2 - 16]
3128 sub edi, edx
3129
3130 convertloop:
3131 movdqu xmm0, [eax]
3132 lea eax, [eax - 16]
3133 pshufb xmm0, xmm1
3134 movlpd qword ptr [edx], xmm0
3135 movhpd qword ptr [edx + edi], xmm0
3136 lea edx, [edx + 8]
3137 sub ecx, 8
3138 jg convertloop
3139
3140 pop edi
3141 ret
3142 }
3143 }
3144 #endif // HAS_MIRRORROW_UV_SSSE3
3145
3146 #ifdef HAS_ARGBMIRRORROW_SSE2
3147 __declspec(naked)
3148 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3149 __asm {
3150 mov eax, [esp + 4] // src
3151 mov edx, [esp + 8] // dst
3152 mov ecx, [esp + 12] // width
3153 lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
3154
3155 convertloop:
3156 movdqu xmm0, [eax]
3157 lea eax, [eax - 16]
3158 pshufd xmm0, xmm0, 0x1b
3159 movdqu [edx], xmm0
3160 lea edx, [edx + 16]
3161 sub ecx, 4
3162 jg convertloop
3163 ret
3164 }
3165 }
3166 #endif // HAS_ARGBMIRRORROW_SSE2
3167
3168 #ifdef HAS_ARGBMIRRORROW_AVX2
3169 // Shuffle table for reversing the bytes.
3170 static const ulvec32 kARGBShuffleMirror_AVX2 = {
3171 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3172 };
3173
3174 __declspec(naked)
3175 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3176 __asm {
3177 mov eax, [esp + 4] // src
3178 mov edx, [esp + 8] // dst
3179 mov ecx, [esp + 12] // width
3180 vmovdqu ymm5, kARGBShuffleMirror_AVX2
3181
3182 convertloop:
3183 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
3184 vmovdqu [edx], ymm0
3185 lea edx, [edx + 32]
3186 sub ecx, 8
3187 jg convertloop
3188 vzeroupper
3189 ret
3190 }
3191 }
3192 #endif // HAS_ARGBMIRRORROW_AVX2
3193
3194 #ifdef HAS_SPLITUVROW_SSE2
3195 __declspec(naked)
3196 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3197 __asm {
3198 push edi
3199 mov eax, [esp + 4 + 4] // src_uv
3200 mov edx, [esp + 4 + 8] // dst_u
3201 mov edi, [esp + 4 + 12] // dst_v
3202 mov ecx, [esp + 4 + 16] // pix
3203 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3204 psrlw xmm5, 8
3205 sub edi, edx
3206
3207 convertloop:
3208 movdqu xmm0, [eax]
3209 movdqu xmm1, [eax + 16]
3210 lea eax, [eax + 32]
3211 movdqa xmm2, xmm0
3212 movdqa xmm3, xmm1
3213 pand xmm0, xmm5 // even bytes
3214 pand xmm1, xmm5
3215 packuswb xmm0, xmm1
3216 psrlw xmm2, 8 // odd bytes
3217 psrlw xmm3, 8
3218 packuswb xmm2, xmm3
3219 movdqu [edx], xmm0
3220 movdqu [edx + edi], xmm2
3221 lea edx, [edx + 16]
3222 sub ecx, 16
3223 jg convertloop
3224
3225 pop edi
3226 ret
3227 }
3228 }
3229
3230 #endif // HAS_SPLITUVROW_SSE2
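
// SplitUVRow deinterleaves a packed UV plane (NV12-style) into planar U and V.
// A scalar sketch of the loop the SSE2/AVX2 versions vectorize:
//   for (i = 0; i < pix; ++i) {
//     dst_u[i] = src_uv[2 * i];      // even bytes
//     dst_v[i] = src_uv[2 * i + 1];  // odd bytes
//   }
// MergeUVRow below performs the inverse interleave.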
3231
3232 #ifdef HAS_SPLITUVROW_AVX2
3233 __declspec(naked)
3234 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3235 __asm {
3236 push edi
3237 mov eax, [esp + 4 + 4] // src_uv
3238 mov edx, [esp + 4 + 8] // dst_u
3239 mov edi, [esp + 4 + 12] // dst_v
3240 mov ecx, [esp + 4 + 16] // pix
3241 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3242 vpsrlw ymm5, ymm5, 8
3243 sub edi, edx
3244
3245 convertloop:
3246 vmovdqu ymm0, [eax]
3247 vmovdqu ymm1, [eax + 32]
3248 lea eax, [eax + 64]
3249 vpsrlw ymm2, ymm0, 8 // odd bytes
3250 vpsrlw ymm3, ymm1, 8
3251 vpand ymm0, ymm0, ymm5 // even bytes
3252 vpand ymm1, ymm1, ymm5
3253 vpackuswb ymm0, ymm0, ymm1
3254 vpackuswb ymm2, ymm2, ymm3
3255 vpermq ymm0, ymm0, 0xd8
3256 vpermq ymm2, ymm2, 0xd8
3257 vmovdqu [edx], ymm0
3258 vmovdqu [edx + edi], ymm2
3259 lea edx, [edx + 32]
3260 sub ecx, 32
3261 jg convertloop
3262
3263 pop edi
3264 vzeroupper
3265 ret
3266 }
3267 }
3268 #endif // HAS_SPLITUVROW_AVX2
3269
3270 #ifdef HAS_MERGEUVROW_SSE2
3271 __declspec(naked)
3272 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3273 int width) {
3274 __asm {
3275 push edi
3276 mov eax, [esp + 4 + 4] // src_u
3277 mov edx, [esp + 4 + 8] // src_v
3278 mov edi, [esp + 4 + 12] // dst_uv
3279 mov ecx, [esp + 4 + 16] // width
3280 sub edx, eax
3281
3282 convertloop:
3283 movdqu xmm0, [eax] // read 16 U's
3284 movdqu xmm1, [eax + edx] // and 16 V's
3285 lea eax, [eax + 16]
3286 movdqa xmm2, xmm0
3287 punpcklbw xmm0, xmm1 // first 8 UV pairs
3288 punpckhbw xmm2, xmm1 // next 8 UV pairs
3289 movdqu [edi], xmm0
3290 movdqu [edi + 16], xmm2
3291 lea edi, [edi + 32]
3292 sub ecx, 16
3293 jg convertloop
3294
3295 pop edi
3296 ret
3297 }
3298 }
3299 #endif // HAS_MERGEUVROW_SSE2
3300
3301 #ifdef HAS_MERGEUVROW_AVX2
3302 __declspec(naked)
3303 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3304 int width) {
3305 __asm {
3306 push edi
3307 mov eax, [esp + 4 + 4] // src_u
3308 mov edx, [esp + 4 + 8] // src_v
3309 mov edi, [esp + 4 + 12] // dst_uv
3310 mov ecx, [esp + 4 + 16] // width
3311 sub edx, eax
3312
3313 convertloop:
3314 vmovdqu ymm0, [eax] // read 32 U's
3315 vmovdqu ymm1, [eax + edx] // and 32 V's
3316 lea eax, [eax + 32]
3317 vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
3318 vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
3319 vextractf128 [edi], ymm2, 0 // bytes 0..15
3320 vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
3321 vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
3322 vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
3323 lea edi, [edi + 64]
3324 sub ecx, 32
3325 jg convertloop
3326
3327 pop edi
3328 vzeroupper
3329 ret
3330 }
3331 }
3332 #endif // HAS_MERGEUVROW_AVX2
3333
3334 #ifdef HAS_COPYROW_SSE2
3335 // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
3336 __declspec(naked)
3337 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
3338 __asm {
3339 mov eax, [esp + 4] // src
3340 mov edx, [esp + 8] // dst
3341 mov ecx, [esp + 12] // count
3342
3343 convertloop:
3344 movdqu xmm0, [eax]
3345 movdqu xmm1, [eax + 16]
3346 lea eax, [eax + 32]
3347 movdqu [edx], xmm0
3348 movdqu [edx + 16], xmm1
3349 lea edx, [edx + 32]
3350 sub ecx, 32
3351 jg convertloop
3352 ret
3353 }
3354 }
3355 #endif // HAS_COPYROW_SSE2
3356
3357 #ifdef HAS_COPYROW_AVX
3358 // CopyRow copies 'count' bytes using 32 byte loads/stores, 64 bytes at a time.
3359 __declspec(naked)
3360 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
3361 __asm {
3362 mov eax, [esp + 4] // src
3363 mov edx, [esp + 8] // dst
3364 mov ecx, [esp + 12] // count
3365
3366 convertloop:
3367 vmovdqu ymm0, [eax]
3368 vmovdqu ymm1, [eax + 32]
3369 lea eax, [eax + 64]
3370 vmovdqu [edx], ymm0
3371 vmovdqu [edx + 32], ymm1
3372 lea edx, [edx + 64]
3373 sub ecx, 64
3374 jg convertloop
3375
3376 vzeroupper
3377 ret
3378 }
3379 }
3380 #endif // HAS_COPYROW_AVX
3381
3382 // Copies any byte count (multiple of 1).
3383 __declspec(naked)
3384 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
3385 __asm {
3386 mov eax, esi // save esi
3387 mov edx, edi // save edi
3388 mov esi, [esp + 4] // src
3389 mov edi, [esp + 8] // dst
3390 mov ecx, [esp + 12] // count
3391 rep movsb
3392 mov edi, edx // restore edi
3393 mov esi, eax // restore esi
3394 ret
3395 }
3396 }
3397
3398 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3399 // width in pixels
3400 __declspec(naked)
3401 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3402 __asm {
3403 mov eax, [esp + 4] // src
3404 mov edx, [esp + 8] // dst
3405 mov ecx, [esp + 12] // width
3406 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3407 pslld xmm0, 24
3408 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3409 psrld xmm1, 8
3410
3411 convertloop:
3412 movdqu xmm2, [eax]
3413 movdqu xmm3, [eax + 16]
3414 lea eax, [eax + 32]
3415 movdqu xmm4, [edx]
3416 movdqu xmm5, [edx + 16]
3417 pand xmm2, xmm0
3418 pand xmm3, xmm0
3419 pand xmm4, xmm1
3420 pand xmm5, xmm1
3421 por xmm2, xmm4
3422 por xmm3, xmm5
3423 movdqu [edx], xmm2
3424 movdqu [edx + 16], xmm3
3425 lea edx, [edx + 32]
3426 sub ecx, 8
3427 jg convertloop
3428
3429 ret
3430 }
3431 }
3432 #endif // HAS_ARGBCOPYALPHAROW_SSE2
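
// The alpha copy above amounts to the scalar merge
//   dst32[i] = (src32[i] & 0xff000000u) | (dst32[i] & 0x00ffffffu);
// taking only the alpha byte from the source; the CopyYToAlpha variants below
// substitute (y << 24) for the source alpha instead.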
3433
3434 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3435 // width in pixels
3436 __declspec(naked)
3437 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3438 __asm {
3439 mov eax, [esp + 4] // src
3440 mov edx, [esp + 8] // dst
3441 mov ecx, [esp + 12] // width
3442 vpcmpeqb ymm0, ymm0, ymm0
3443 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3444
3445 convertloop:
3446 vmovdqu ymm1, [eax]
3447 vmovdqu ymm2, [eax + 32]
3448 lea eax, [eax + 64]
3449 vpblendvb ymm1, ymm1, [edx], ymm0
3450 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3451 vmovdqu [edx], ymm1
3452 vmovdqu [edx + 32], ymm2
3453 lea edx, [edx + 64]
3454 sub ecx, 16
3455 jg convertloop
3456
3457 vzeroupper
3458 ret
3459 }
3460 }
3461 #endif // HAS_ARGBCOPYALPHAROW_AVX2
3462
3463 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3464 // width in pixels
3465 __declspec(naked)
3466 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3467 __asm {
3468 mov eax, [esp + 4] // src
3469 mov edx, [esp + 8] // dst
3470 mov ecx, [esp + 12] // width
3471 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3472 pslld xmm0, 24
3473 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3474 psrld xmm1, 8
3475
3476 convertloop:
3477 movq xmm2, qword ptr [eax] // 8 Y's
3478 lea eax, [eax + 8]
3479 punpcklbw xmm2, xmm2
3480 punpckhwd xmm3, xmm2
3481 punpcklwd xmm2, xmm2
3482 movdqu xmm4, [edx]
3483 movdqu xmm5, [edx + 16]
3484 pand xmm2, xmm0
3485 pand xmm3, xmm0
3486 pand xmm4, xmm1
3487 pand xmm5, xmm1
3488 por xmm2, xmm4
3489 por xmm3, xmm5
3490 movdqu [edx], xmm2
3491 movdqu [edx + 16], xmm3
3492 lea edx, [edx + 32]
3493 sub ecx, 8
3494 jg convertloop
3495
3496 ret
3497 }
3498 }
3499 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3500
3501 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3502 // width in pixels
3503 __declspec(naked)
3504 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3505 __asm {
3506 mov eax, [esp + 4] // src
3507 mov edx, [esp + 8] // dst
3508 mov ecx, [esp + 12] // width
3509 vpcmpeqb ymm0, ymm0, ymm0
3510 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3511
3512 convertloop:
3513 vpmovzxbd ymm1, qword ptr [eax]
3514 vpmovzxbd ymm2, qword ptr [eax + 8]
3515 lea eax, [eax + 16]
3516 vpslld ymm1, ymm1, 24
3517 vpslld ymm2, ymm2, 24
3518 vpblendvb ymm1, ymm1, [edx], ymm0
3519 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3520 vmovdqu [edx], ymm1
3521 vmovdqu [edx + 32], ymm2
3522 lea edx, [edx + 64]
3523 sub ecx, 16
3524 jg convertloop
3525
3526 vzeroupper
3527 ret
3528 }
3529 }
3530 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3531
3532 #ifdef HAS_SETROW_X86
3533 // Write 'count' bytes using an 8 bit value repeated.
3534 // Count should be a multiple of 4.
3535 __declspec(naked)
3536 void SetRow_X86(uint8* dst, uint8 v8, int count) {
3537 __asm {
3538 movzx eax, byte ptr [esp + 8] // v8
3539 mov edx, 0x01010101 // Duplicate byte to all bytes.
3540 mul edx // overwrites edx with upper part of result.
3541 mov edx, edi // save edi
3542 mov edi, [esp + 4] // dst
3543 mov ecx, [esp + 12] // count
3544 shr ecx, 2
3545 rep stosd
3546 mov edi, edx // restore edi
3547 ret
3548 }
3549 }
3550
3551 // Write 'count' bytes using an 8 bit value repeated.
3552 __declspec(naked)
3553 void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
3554 __asm {
3555 mov edx, edi
3556 mov edi, [esp + 4] // dst
3557 mov eax, [esp + 8] // v8
3558 mov ecx, [esp + 12] // count
3559 rep stosb
3560 mov edi, edx
3561 ret
3562 }
3563 }
3564
3565 // Write 'count' 32 bit values.
3566 __declspec(naked)
3567 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
3568 __asm {
3569 mov edx, edi
3570 mov edi, [esp + 4] // dst
3571 mov eax, [esp + 8] // v32
3572 mov ecx, [esp + 12] // count
3573 rep stosd
3574 mov edi, edx
3575 ret
3576 }
3577 }
3578 #endif // HAS_SETROW_X86
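
// SetRow_X86 splats the byte with a multiply before rep stosd, e.g.
//   uint32 v32 = (uint32)v8 * 0x01010101u;  // 0xAB -> 0xABABABAB
// so each 32-bit store writes four copies of v8.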
3579
3580 #ifdef HAS_YUY2TOYROW_AVX2
3581 __declspec(naked)
3582 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
3583 uint8* dst_y, int pix) {
3584 __asm {
3585 mov eax, [esp + 4] // src_yuy2
3586 mov edx, [esp + 8] // dst_y
3587 mov ecx, [esp + 12] // pix
3588 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3589 vpsrlw ymm5, ymm5, 8
3590
3591 convertloop:
3592 vmovdqu ymm0, [eax]
3593 vmovdqu ymm1, [eax + 32]
3594 lea eax, [eax + 64]
3595 vpand ymm0, ymm0, ymm5 // even bytes are Y
3596 vpand ymm1, ymm1, ymm5
3597 vpackuswb ymm0, ymm0, ymm1 // mutates.
3598 vpermq ymm0, ymm0, 0xd8
3599 vmovdqu [edx], ymm0
3600 lea edx, [edx + 32]
3601 sub ecx, 32
3602 jg convertloop
3603 vzeroupper
3604 ret
3605 }
3606 }
3607
3608 __declspec(naked)
3609 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3610 uint8* dst_u, uint8* dst_v, int pix) {
3611 __asm {
3612 push esi
3613 push edi
3614 mov eax, [esp + 8 + 4] // src_yuy2
3615 mov esi, [esp + 8 + 8] // stride_yuy2
3616 mov edx, [esp + 8 + 12] // dst_u
3617 mov edi, [esp + 8 + 16] // dst_v
3618 mov ecx, [esp + 8 + 20] // pix
3619 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3620 vpsrlw ymm5, ymm5, 8
3621 sub edi, edx
3622
3623 convertloop:
3624 vmovdqu ymm0, [eax]
3625 vmovdqu ymm1, [eax + 32]
3626 vpavgb ymm0, ymm0, [eax + esi]
3627 vpavgb ymm1, ymm1, [eax + esi + 32]
3628 lea eax, [eax + 64]
3629 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3630 vpsrlw ymm1, ymm1, 8
3631 vpackuswb ymm0, ymm0, ymm1 // mutates.
3632 vpermq ymm0, ymm0, 0xd8
3633 vpand ymm1, ymm0, ymm5 // U
3634 vpsrlw ymm0, ymm0, 8 // V
3635 vpackuswb ymm1, ymm1, ymm1 // mutates.
3636 vpackuswb ymm0, ymm0, ymm0 // mutates.
3637 vpermq ymm1, ymm1, 0xd8
3638 vpermq ymm0, ymm0, 0xd8
3639 vextractf128 [edx], ymm1, 0 // U
3640 vextractf128 [edx + edi], ymm0, 0 // V
3641 lea edx, [edx + 16]
3642 sub ecx, 32
3643 jg convertloop
3644
3645 pop edi
3646 pop esi
3647 vzeroupper
3648 ret
3649 }
3650 }
3651
3652 __declspec(naked)
3653 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3654 uint8* dst_u, uint8* dst_v, int pix) {
3655 __asm {
3656 push edi
3657 mov eax, [esp + 4 + 4] // src_yuy2
3658 mov edx, [esp + 4 + 8] // dst_u
3659 mov edi, [esp + 4 + 12] // dst_v
3660 mov ecx, [esp + 4 + 16] // pix
3661 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3662 vpsrlw ymm5, ymm5, 8
3663 sub edi, edx
3664
3665 convertloop:
3666 vmovdqu ymm0, [eax]
3667 vmovdqu ymm1, [eax + 32]
3668 lea eax, [eax + 64]
3669 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3670 vpsrlw ymm1, ymm1, 8
3671 vpackuswb ymm0, ymm0, ymm1 // mutates.
3672 vpermq ymm0, ymm0, 0xd8
3673 vpand ymm1, ymm0, ymm5 // U
3674 vpsrlw ymm0, ymm0, 8 // V
3675 vpackuswb ymm1, ymm1, ymm1 // mutates.
3676 vpackuswb ymm0, ymm0, ymm0 // mutates.
3677 vpermq ymm1, ymm1, 0xd8
3678 vpermq ymm0, ymm0, 0xd8
3679 vextractf128 [edx], ymm1, 0 // U
3680 vextractf128 [edx + edi], ymm0, 0 // V
3681 lea edx, [edx + 16]
3682 sub ecx, 32
3683 jg convertloop
3684
3685 pop edi
3686 vzeroupper
3687 ret
3688 }
3689 }
3690
3691 __declspec(naked)
3692 void UYVYToYRow_AVX2(const uint8* src_uyvy,
3693 uint8* dst_y, int pix) {
3694 __asm {
3695 mov eax, [esp + 4] // src_uyvy
3696 mov edx, [esp + 8] // dst_y
3697 mov ecx, [esp + 12] // pix
3698
3699 convertloop:
3700 vmovdqu ymm0, [eax]
3701 vmovdqu ymm1, [eax + 32]
3702 lea eax, [eax + 64]
3703 vpsrlw ymm0, ymm0, 8 // odd bytes are Y
3704 vpsrlw ymm1, ymm1, 8
3705 vpackuswb ymm0, ymm0, ymm1 // mutates.
3706 vpermq ymm0, ymm0, 0xd8
3707 vmovdqu [edx], ymm0
3708 lea edx, [edx + 32]
3709 sub ecx, 32
3710 jg convertloop
3711 vzeroupper
3712 ret
3713 }
3714 }
3715
3716 __declspec(naked)
3717 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3718 uint8* dst_u, uint8* dst_v, int pix) {
3719 __asm {
3720 push esi
3721 push edi
3722 mov eax, [esp + 8 + 4] // src_uyvy
3723 mov esi, [esp + 8 + 8] // stride_uyvy
3724 mov edx, [esp + 8 + 12] // dst_u
3725 mov edi, [esp + 8 + 16] // dst_v
3726 mov ecx, [esp + 8 + 20] // pix
3727 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3728 vpsrlw ymm5, ymm5, 8
3729 sub edi, edx
3730
3731 convertloop:
3732 vmovdqu ymm0, [eax]
3733 vmovdqu ymm1, [eax + 32]
3734 vpavgb ymm0, ymm0, [eax + esi]
3735 vpavgb ymm1, ymm1, [eax + esi + 32]
3736 lea eax, [eax + 64]
3737 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3738 vpand ymm1, ymm1, ymm5
3739 vpackuswb ymm0, ymm0, ymm1 // mutates.
3740 vpermq ymm0, ymm0, 0xd8
3741 vpand ymm1, ymm0, ymm5 // U
3742 vpsrlw ymm0, ymm0, 8 // V
3743 vpackuswb ymm1, ymm1, ymm1 // mutates.
3744 vpackuswb ymm0, ymm0, ymm0 // mutates.
3745 vpermq ymm1, ymm1, 0xd8
3746 vpermq ymm0, ymm0, 0xd8
3747 vextractf128 [edx], ymm1, 0 // U
3748 vextractf128 [edx + edi], ymm0, 0 // V
3749 lea edx, [edx + 16]
3750 sub ecx, 32
3751 jg convertloop
3752
3753 pop edi
3754 pop esi
3755 vzeroupper
3756 ret
3757 }
3758 }
3759
3760 __declspec(naked)
3761 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3762 uint8* dst_u, uint8* dst_v, int pix) {
3763 __asm {
3764 push edi
3765 mov eax, [esp + 4 + 4] // src_uyvy
3766 mov edx, [esp + 4 + 8] // dst_u
3767 mov edi, [esp + 4 + 12] // dst_v
3768 mov ecx, [esp + 4 + 16] // pix
3769 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3770 vpsrlw ymm5, ymm5, 8
3771 sub edi, edx
3772
3773 convertloop:
3774 vmovdqu ymm0, [eax]
3775 vmovdqu ymm1, [eax + 32]
3776 lea eax, [eax + 64]
3777 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3778 vpand ymm1, ymm1, ymm5
3779 vpackuswb ymm0, ymm0, ymm1 // mutates.
3780 vpermq ymm0, ymm0, 0xd8
3781 vpand ymm1, ymm0, ymm5 // U
3782 vpsrlw ymm0, ymm0, 8 // V
3783 vpackuswb ymm1, ymm1, ymm1 // mutates.
3784 vpackuswb ymm0, ymm0, ymm0 // mutates.
3785 vpermq ymm1, ymm1, 0xd8
3786 vpermq ymm0, ymm0, 0xd8
3787 vextractf128 [edx], ymm1, 0 // U
3788 vextractf128 [edx + edi], ymm0, 0 // V
3789 lea edx, [edx + 16]
3790 sub ecx, 32
3791 jg convertloop
3792
3793 pop edi
3794 vzeroupper
3795 ret
3796 }
3797 }
3798 #endif // HAS_YUY2TOYROW_AVX2
3799
3800 #ifdef HAS_YUY2TOYROW_SSE2
3801 __declspec(naked)
3802 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
3803 uint8* dst_y, int pix) {
3804 __asm {
3805 mov eax, [esp + 4] // src_yuy2
3806 mov edx, [esp + 8] // dst_y
3807 mov ecx, [esp + 12] // pix
3808 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3809 psrlw xmm5, 8
3810
3811 convertloop:
3812 movdqu xmm0, [eax]
3813 movdqu xmm1, [eax + 16]
3814 lea eax, [eax + 32]
3815 pand xmm0, xmm5 // even bytes are Y
3816 pand xmm1, xmm5
3817 packuswb xmm0, xmm1
3818 movdqu [edx], xmm0
3819 lea edx, [edx + 16]
3820 sub ecx, 16
3821 jg convertloop
3822 ret
3823 }
3824 }
3825
3826 __declspec(naked)
3827 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
3828 uint8* dst_u, uint8* dst_v, int pix) {
3829 __asm {
3830 push esi
3831 push edi
3832 mov eax, [esp + 8 + 4] // src_yuy2
3833 mov esi, [esp + 8 + 8] // stride_yuy2
3834 mov edx, [esp + 8 + 12] // dst_u
3835 mov edi, [esp + 8 + 16] // dst_v
3836 mov ecx, [esp + 8 + 20] // pix
3837 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3838 psrlw xmm5, 8
3839 sub edi, edx
3840
3841 convertloop:
3842 movdqu xmm0, [eax]
3843 movdqu xmm1, [eax + 16]
3844 movdqu xmm2, [eax + esi]
3845 movdqu xmm3, [eax + esi + 16]
3846 lea eax, [eax + 32]
3847 pavgb xmm0, xmm2
3848 pavgb xmm1, xmm3
3849 psrlw xmm0, 8 // YUYV -> UVUV
3850 psrlw xmm1, 8
3851 packuswb xmm0, xmm1
3852 movdqa xmm1, xmm0
3853 pand xmm0, xmm5 // U
3854 packuswb xmm0, xmm0
3855 psrlw xmm1, 8 // V
3856 packuswb xmm1, xmm1
3857 movq qword ptr [edx], xmm0
3858 movq qword ptr [edx + edi], xmm1
3859 lea edx, [edx + 8]
3860 sub ecx, 16
3861 jg convertloop
3862
3863 pop edi
3864 pop esi
3865 ret
3866 }
3867 }
3868
3869 __declspec(naked)
3870 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3871 uint8* dst_u, uint8* dst_v, int pix) {
3872 __asm {
3873 push edi
3874 mov eax, [esp + 4 + 4] // src_yuy2
3875 mov edx, [esp + 4 + 8] // dst_u
3876 mov edi, [esp + 4 + 12] // dst_v
3877 mov ecx, [esp + 4 + 16] // pix
3878 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3879 psrlw xmm5, 8
3880 sub edi, edx
3881
3882 convertloop:
3883 movdqu xmm0, [eax]
3884 movdqu xmm1, [eax + 16]
3885 lea eax, [eax + 32]
3886 psrlw xmm0, 8 // YUYV -> UVUV
3887 psrlw xmm1, 8
3888 packuswb xmm0, xmm1
3889 movdqa xmm1, xmm0
3890 pand xmm0, xmm5 // U
3891 packuswb xmm0, xmm0
3892 psrlw xmm1, 8 // V
3893 packuswb xmm1, xmm1
3894 movq qword ptr [edx], xmm0
3895 movq qword ptr [edx + edi], xmm1
3896 lea edx, [edx + 8]
3897 sub ecx, 16
3898 jg convertloop
3899
3900 pop edi
3901 ret
3902 }
3903 }
3904
3905 __declspec(naked)
3906 void UYVYToYRow_SSE2(const uint8* src_uyvy,
3907 uint8* dst_y, int pix) {
3908 __asm {
3909 mov eax, [esp + 4] // src_uyvy
3910 mov edx, [esp + 8] // dst_y
3911 mov ecx, [esp + 12] // pix
3912
3913 convertloop:
3914 movdqu xmm0, [eax]
3915 movdqu xmm1, [eax + 16]
3916 lea eax, [eax + 32]
3917 psrlw xmm0, 8 // odd bytes are Y
3918 psrlw xmm1, 8
3919 packuswb xmm0, xmm1
3920 movdqu [edx], xmm0
3921 lea edx, [edx + 16]
3922 sub ecx, 16
3923 jg convertloop
3924 ret
3925 }
3926 }
3927
3928 __declspec(naked)
3929 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
3930 uint8* dst_u, uint8* dst_v, int pix) {
3931 __asm {
3932 push esi
3933 push edi
3934 mov eax, [esp + 8 + 4] // src_uyvy
3935 mov esi, [esp + 8 + 8] // stride_uyvy
3936 mov edx, [esp + 8 + 12] // dst_u
3937 mov edi, [esp + 8 + 16] // dst_v
3938 mov ecx, [esp + 8 + 20] // pix
3939 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3940 psrlw xmm5, 8
3941 sub edi, edx
3942
3943 convertloop:
3944 movdqu xmm0, [eax]
3945 movdqu xmm1, [eax + 16]
3946 movdqu xmm2, [eax + esi]
3947 movdqu xmm3, [eax + esi + 16]
3948 lea eax, [eax + 32]
3949 pavgb xmm0, xmm2
3950 pavgb xmm1, xmm3
3951 pand xmm0, xmm5 // UYVY -> UVUV
3952 pand xmm1, xmm5
3953 packuswb xmm0, xmm1
3954 movdqa xmm1, xmm0
3955 pand xmm0, xmm5 // U
3956 packuswb xmm0, xmm0
3957 psrlw xmm1, 8 // V
3958 packuswb xmm1, xmm1
3959 movq qword ptr [edx], xmm0
3960 movq qword ptr [edx + edi], xmm1
3961 lea edx, [edx + 8]
3962 sub ecx, 16
3963 jg convertloop
3964
3965 pop edi
3966 pop esi
3967 ret
3968 }
3969 }
3970
3971 __declspec(naked)
3972 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3973 uint8* dst_u, uint8* dst_v, int pix) {
3974 __asm {
3975 push edi
3976 mov eax, [esp + 4 + 4] // src_uyvy
3977 mov edx, [esp + 4 + 8] // dst_u
3978 mov edi, [esp + 4 + 12] // dst_v
3979 mov ecx, [esp + 4 + 16] // pix
3980 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3981 psrlw xmm5, 8
3982 sub edi, edx
3983
3984 convertloop:
3985 movdqu xmm0, [eax]
3986 movdqu xmm1, [eax + 16]
3987 lea eax, [eax + 32]
3988 pand xmm0, xmm5 // UYVY -> UVUV
3989 pand xmm1, xmm5
3990 packuswb xmm0, xmm1
3991 movdqa xmm1, xmm0
3992 pand xmm0, xmm5 // U
3993 packuswb xmm0, xmm0
3994 psrlw xmm1, 8 // V
3995 packuswb xmm1, xmm1
3996 movq qword ptr [edx], xmm0
3997 movq qword ptr [edx + edi], xmm1
3998 lea edx, [edx + 8]
3999 sub ecx, 16
4000 jg convertloop
4001
4002 pop edi
4003 ret
4004 }
4005 }
4006 #endif // HAS_YUY2TOYROW_SSE2
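
// For reference, a scalar sketch of what the YUY2/UYVY chroma rows above
// compute (an editor-added illustration, not part of the original build):
// YUY2 packs pixels as Y0 U Y1 V and UYVY as U Y0 V Y1; the UV variants
// average the chroma of two adjacent rows, matching vpavgb/pavgb above.
static void YUY2ToUVRow_Reference(const uint8* src_yuy2, int stride_yuy2,
                                  uint8* dst_u, uint8* dst_v, int pix) {
  int x;
  for (x = 0; x < pix; x += 2) {
    const uint8* p = src_yuy2 + x * 2;  // 2 bytes per pixel.
    const uint8* q = p + stride_yuy2;   // same position, one row down.
    dst_u[x / 2] = (uint8)((p[1] + q[1] + 1) >> 1);  // rounded average U.
    dst_v[x / 2] = (uint8)((p[3] + q[3] + 1) >> 1);  // rounded average V.
  }
}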
4007
4008 #ifdef HAS_ARGBBLENDROW_SSE2
4009 // Blend 4 pixels at a time.
4010 __declspec(naked)
4011 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4012 uint8* dst_argb, int width) {
4013 __asm {
4014 push esi
4015 mov eax, [esp + 4 + 4] // src_argb0
4016 mov esi, [esp + 4 + 8] // src_argb1
4017 mov edx, [esp + 4 + 12] // dst_argb
4018 mov ecx, [esp + 4 + 16] // width
4019 pcmpeqb xmm7, xmm7 // generate constant 0x0001
4020 psrlw xmm7, 15
4021 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4022 psrlw xmm6, 8
4023 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4024 psllw xmm5, 8
4025 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4026 pslld xmm4, 24
4027 sub ecx, 4
4028 jl convertloop4b // less than 4 pixels?
4029
4030 // 4 pixel loop.
4031 convertloop4:
4032 movdqu xmm3, [eax] // src argb
4033 lea eax, [eax + 16]
4034 movdqa xmm0, xmm3 // src argb
4035 pxor xmm3, xmm4 // ~alpha
4036 movdqu xmm2, [esi] // _r_b
4037 psrlw xmm3, 8 // alpha
4038 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4039 pshuflw xmm3, xmm3, 0F5h
4040 pand xmm2, xmm6 // _r_b
4041 paddw xmm3, xmm7 // 256 - alpha
4042 pmullw xmm2, xmm3 // _r_b * alpha
4043 movdqu xmm1, [esi] // _a_g
4044 lea esi, [esi + 16]
4045 psrlw xmm1, 8 // _a_g
4046 por xmm0, xmm4 // set alpha to 255
4047 pmullw xmm1, xmm3 // _a_g * alpha
4048 psrlw xmm2, 8 // _r_b convert to 8 bits again
4049 paddusb xmm0, xmm2 // + src argb
4050 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4051 paddusb xmm0, xmm1 // + src argb
4052 movdqu [edx], xmm0
4053 lea edx, [edx + 16]
4054 sub ecx, 4
4055 jge convertloop4
4056
4057 convertloop4b:
4058 add ecx, 4 - 1
4059 jl convertloop1b
4060
4061 // 1 pixel loop.
4062 convertloop1:
4063 movd xmm3, [eax] // src argb
4064 lea eax, [eax + 4]
4065 movdqa xmm0, xmm3 // src argb
4066 pxor xmm3, xmm4 // ~alpha
4067 movd xmm2, [esi] // _r_b
4068 psrlw xmm3, 8 // alpha
4069 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4070 pshuflw xmm3, xmm3, 0F5h
4071 pand xmm2, xmm6 // _r_b
4072 paddw xmm3, xmm7 // 256 - alpha
4073 pmullw xmm2, xmm3 // _r_b * alpha
4074 movd xmm1, [esi] // _a_g
4075 lea esi, [esi + 4]
4076 psrlw xmm1, 8 // _a_g
4077 por xmm0, xmm4 // set alpha to 255
4078 pmullw xmm1, xmm3 // _a_g * alpha
4079 psrlw xmm2, 8 // _r_b convert to 8 bits again
4080 paddusb xmm0, xmm2 // + src argb
4081 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4082 paddusb xmm0, xmm1 // + src argb
4083 movd [edx], xmm0
4084 lea edx, [edx + 4]
4085 sub ecx, 1
4086 jge convertloop1
4087
4088 convertloop1b:
4089 pop esi
4090 ret
4091 }
4092 }
4093 #endif // HAS_ARGBBLENDROW_SSE2
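
// A scalar sketch of the blend above (editor-added illustration; assumes
// src_argb0 carries premultiplied alpha, which is how the SIMD code uses it):
// each color channel is dst = src0 + src1 * (256 - alpha0) >> 8 with
// unsigned saturation, and the destination alpha is forced to 255.
static void ARGBBlendRow_Reference(const uint8* src_argb0,
                                   const uint8* src_argb1,
                                   uint8* dst_argb, int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    int a = src_argb0[x * 4 + 3];
    for (c = 0; c < 3; ++c) {
      int v = src_argb0[x * 4 + c] + ((src_argb1[x * 4 + c] * (256 - a)) >> 8);
      dst_argb[x * 4 + c] = (uint8)(v > 255 ? 255 : v);  // paddusb saturation.
    }
    dst_argb[x * 4 + 3] = 255;  // por with 0xff000000 forces opaque alpha.
  }
}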
4094
4095 #ifdef HAS_ARGBBLENDROW_SSSE3
4096 // Shuffle table for isolating alpha.
4097 static const uvec8 kShuffleAlpha = {
4098 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4099 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
4100 };
4101 // Same as SSE2, but replaces:
4102 // psrlw xmm3, 8 // alpha
4103 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4104 // pshuflw xmm3, xmm3, 0F5h
4105 // with:
4106 // pshufb xmm3, kShuffleAlpha // alpha
4107 // Blend 4 pixels at a time.
4108
4109 __declspec(naked)
4110 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
4111 uint8* dst_argb, int width) {
4112 __asm {
4113 push esi
4114 mov eax, [esp + 4 + 4] // src_argb0
4115 mov esi, [esp + 4 + 8] // src_argb1
4116 mov edx, [esp + 4 + 12] // dst_argb
4117 mov ecx, [esp + 4 + 16] // width
4118 pcmpeqb xmm7, xmm7 // generate constant 0x0001
4119 psrlw xmm7, 15
4120 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4121 psrlw xmm6, 8
4122 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4123 psllw xmm5, 8
4124 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4125 pslld xmm4, 24
4126 sub ecx, 4
4127 jl convertloop4b // less than 4 pixels?
4128
4129 // 4 pixel loop.
4130 convertloop4:
4131 movdqu xmm3, [eax] // src argb
4132 lea eax, [eax + 16]
4133 movdqa xmm0, xmm3 // src argb
4134 pxor xmm3, xmm4 // ~alpha
4135 movdqu xmm2, [esi] // _r_b
4136 pshufb xmm3, kShuffleAlpha // alpha
4137 pand xmm2, xmm6 // _r_b
4138 paddw xmm3, xmm7 // 256 - alpha
4139 pmullw xmm2, xmm3 // _r_b * alpha
4140 movdqu xmm1, [esi] // _a_g
4141 lea esi, [esi + 16]
4142 psrlw xmm1, 8 // _a_g
4143 por xmm0, xmm4 // set alpha to 255
4144 pmullw xmm1, xmm3 // _a_g * alpha
4145 psrlw xmm2, 8 // _r_b convert to 8 bits again
4146 paddusb xmm0, xmm2 // + src argb
4147 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4148 paddusb xmm0, xmm1 // + src argb
4149 movdqu [edx], xmm0
4150 lea edx, [edx + 16]
4151 sub ecx, 4
4152 jge convertloop4
4153
4154 convertloop4b:
4155 add ecx, 4 - 1
4156 jl convertloop1b
4157
4158 // 1 pixel loop.
4159 convertloop1:
4160 movd xmm3, [eax] // src argb
4161 lea eax, [eax + 4]
4162 movdqa xmm0, xmm3 // src argb
4163 pxor xmm3, xmm4 // ~alpha
4164 movd xmm2, [esi] // _r_b
4165 pshufb xmm3, kShuffleAlpha // alpha
4166 pand xmm2, xmm6 // _r_b
4167 paddw xmm3, xmm7 // 256 - alpha
4168 pmullw xmm2, xmm3 // _r_b * alpha
4169 movd xmm1, [esi] // _a_g
4170 lea esi, [esi + 4]
4171 psrlw xmm1, 8 // _a_g
4172 por xmm0, xmm4 // set alpha to 255
4173 pmullw xmm1, xmm3 // _a_g * alpha
4174 psrlw xmm2, 8 // _r_b convert to 8 bits again
4175 paddusb xmm0, xmm2 // + src argb
4176 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4177 paddusb xmm0, xmm1 // + src argb
4178 movd [edx], xmm0
4179 lea edx, [edx + 4]
4180 sub ecx, 1
4181 jge convertloop1
4182
4183 convertloop1b:
4184 pop esi
4185 ret
4186 }
4187 }
4188 #endif // HAS_ARGBBLENDROW_SSSE3
4189
4190 #ifdef HAS_ARGBATTENUATEROW_SSE2
4191 // Attenuate 4 pixels at a time.
4192 __declspec(naked)
4193 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
4194 __asm {
4195 mov eax, [esp + 4] // src_argb0
4196 mov edx, [esp + 8] // dst_argb
4197 mov ecx, [esp + 12] // width
4198 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4199 pslld xmm4, 24
4200 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
4201 psrld xmm5, 8
4202
4203 convertloop:
4204 movdqu xmm0, [eax] // read 4 pixels
4205 punpcklbw xmm0, xmm0 // first 2
4206 pshufhw xmm2, xmm0, 0FFh // 8 alpha words
4207 pshuflw xmm2, xmm2, 0FFh
4208 pmulhuw xmm0, xmm2 // rgb * a
4209 movdqu xmm1, [eax] // read 4 pixels
4210 punpckhbw xmm1, xmm1 // next 2 pixels
4211 pshufhw xmm2, xmm1, 0FFh // 8 alpha words
4212 pshuflw xmm2, xmm2, 0FFh
4213 pmulhuw xmm1, xmm2 // rgb * a
4214 movdqu xmm2, [eax] // alphas
4215 lea eax, [eax + 16]
4216 psrlw xmm0, 8
4217 pand xmm2, xmm4
4218 psrlw xmm1, 8
4219 packuswb xmm0, xmm1
4220 pand xmm0, xmm5 // keep original alphas
4221 por xmm0, xmm2
4222 movdqu [edx], xmm0
4223 lea edx, [edx + 16]
4224 sub ecx, 4
4225 jg convertloop
4226
4227 ret
4228 }
4229 }
4230 #endif // HAS_ARGBATTENUATEROW_SSE2
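
// Scalar sketch of attenuation (editor-added illustration; the pmulhuw path
// above computes (c * 257 * a * 257) >> 24, which rounds slightly differently
// from the exact c * a / 255 shown here):
static void ARGBAttenuateRow_Reference(const uint8* src_argb, uint8* dst_argb,
                                       int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    int a = src_argb[x * 4 + 3];
    for (c = 0; c < 3; ++c) {
      dst_argb[x * 4 + c] = (uint8)((src_argb[x * 4 + c] * a) / 255);
    }
    dst_argb[x * 4 + 3] = (uint8)a;  // alpha is copied through unchanged.
  }
}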
4231
4232 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4233 // Shuffle table duplicating alpha.
4234 static const uvec8 kShuffleAlpha0 = {
4235 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4236 };
4237 static const uvec8 kShuffleAlpha1 = {
4238 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4239 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4240 };
4241 __declspec(naked)
4242 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4243 __asm {
4244 mov eax, [esp + 4] // src_argb0
4245 mov edx, [esp + 8] // dst_argb
4246 mov ecx, [esp + 12] // width
4247 pcmpeqb xmm3, xmm3 // generate mask 0xff000000
4248 pslld xmm3, 24
4249 movdqa xmm4, kShuffleAlpha0
4250 movdqa xmm5, kShuffleAlpha1
4251
4252 convertloop:
4253 movdqu xmm0, [eax] // read 4 pixels
4254 pshufb xmm0, xmm4 // isolate first 2 alphas
4255 movdqu xmm1, [eax] // read 4 pixels
4256 punpcklbw xmm1, xmm1 // first 2 pixel rgbs
4257 pmulhuw xmm0, xmm1 // rgb * a
4258 movdqu xmm1, [eax] // read 4 pixels
4259 pshufb xmm1, xmm5 // isolate next 2 alphas
4260 movdqu xmm2, [eax] // read 4 pixels
4261 punpckhbw xmm2, xmm2 // next 2 pixel rgbs
4262 pmulhuw xmm1, xmm2 // rgb * a
4263 movdqu xmm2, [eax] // mask original alpha
4264 lea eax, [eax + 16]
4265 pand xmm2, xmm3
4266 psrlw xmm0, 8
4267 psrlw xmm1, 8
4268 packuswb xmm0, xmm1
4269 por xmm0, xmm2 // copy original alpha
4270 movdqu [edx], xmm0
4271 lea edx, [edx + 16]
4272 sub ecx, 4
4273 jg convertloop
4274
4275 ret
4276 }
4277 }
4278 #endif // HAS_ARGBATTENUATEROW_SSSE3
4279
4280 #ifdef HAS_ARGBATTENUATEROW_AVX2
4281 // Shuffle table duplicating alpha.
4282 static const uvec8 kShuffleAlpha_AVX2 = {
4283 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
4284 };
4285 __declspec(naked)
4286 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
4287 __asm {
4288 mov eax, [esp + 4] // src_argb0
4289 mov edx, [esp + 8] // dst_argb
4290 mov ecx, [esp + 12] // width
4291 sub edx, eax
4292 vbroadcastf128 ymm4, kShuffleAlpha_AVX2
4293 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
4294 vpslld ymm5, ymm5, 24
4295
4296 convertloop:
4297 vmovdqu ymm6, [eax] // read 8 pixels.
4298 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4299 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4300 vpshufb ymm2, ymm0, ymm4 // low 4 alphas
4301 vpshufb ymm3, ymm1, ymm4 // high 4 alphas
4302 vpmulhuw ymm0, ymm0, ymm2 // rgb * a
4303 vpmulhuw ymm1, ymm1, ymm3 // rgb * a
4304 vpand ymm6, ymm6, ymm5 // isolate alpha
4305 vpsrlw ymm0, ymm0, 8
4306 vpsrlw ymm1, ymm1, 8
4307 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4308 vpor ymm0, ymm0, ymm6 // copy original alpha
4309 vmovdqu [eax + edx], ymm0
4310 lea eax, [eax + 32]
4311 sub ecx, 8
4312 jg convertloop
4313
4314 vzeroupper
4315 ret
4316 }
4317 }
4318 #endif // HAS_ARGBATTENUATEROW_AVX2
4319
4320 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4321 // Unattenuate 4 pixels at a time.
4322 __declspec(naked)
4323 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4324 int width) {
4325 __asm {
4326 push esi
4327 push edi
4328 mov eax, [esp + 8 + 4] // src_argb0
4329 mov edx, [esp + 8 + 8] // dst_argb
4330 mov ecx, [esp + 8 + 12] // width
4331
4332 convertloop:
4333 movdqu xmm0, [eax] // read 4 pixels
4334 movzx esi, byte ptr [eax + 3] // first alpha
4335 movzx edi, byte ptr [eax + 7] // second alpha
4336 punpcklbw xmm0, xmm0 // first 2
4337 movd xmm2, dword ptr fixed_invtbl8[esi * 4]
4338 movd xmm3, dword ptr fixed_invtbl8[edi * 4]
4339 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
4340 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4341 movlhps xmm2, xmm3
4342 pmulhuw xmm0, xmm2 // rgb * a
4343
4344 movdqu xmm1, [eax] // read 4 pixels
4345 movzx esi, byte ptr [eax + 11] // third alpha
4346 movzx edi, byte ptr [eax + 15] // fourth alpha
4347 punpckhbw xmm1, xmm1 // next 2
4348 movd xmm2, dword ptr fixed_invtbl8[esi * 4]
4349 movd xmm3, dword ptr fixed_invtbl8[edi * 4]
4350 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
4351 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4352 movlhps xmm2, xmm3
4353 pmulhuw xmm1, xmm2 // rgb * a
4354 lea eax, [eax + 16]
4355
4356 packuswb xmm0, xmm1
4357 movdqu [edx], xmm0
4358 lea edx, [edx + 16]
4359 sub ecx, 4
4360 jg convertloop
4361 pop edi
4362 pop esi
4363 ret
4364 }
4365 }
4366 #endif // HAS_ARGBUNATTENUATEROW_SSE2
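
// Scalar sketch of unattenuation (editor-added illustration; fixed_invtbl8,
// defined elsewhere in the library, supplies fixed-point reciprocals of
// alpha, so the SIMD result approximates the exact division shown here):
static void ARGBUnattenuateRow_Reference(const uint8* src_argb,
                                         uint8* dst_argb, int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    int a = src_argb[x * 4 + 3];
    for (c = 0; c < 3; ++c) {
      int v = a ? (src_argb[x * 4 + c] * 255) / a : src_argb[x * 4 + c];
      dst_argb[x * 4 + c] = (uint8)(v > 255 ? 255 : v);
    }
    dst_argb[x * 4 + 3] = (uint8)a;
  }
}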
4367
4368 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
4369 // Shuffle table duplicating alpha.
4370 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
4371 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
4372 };
4373 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4374 // USE_GATHER is not on by default, due to being a slow instruction.
4375 #ifdef USE_GATHER
4376 __declspec(naked)
4377 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4378 int width) {
4379 __asm {
4380 mov eax, [esp + 4] // src_argb0
4381 mov edx, [esp + 8] // dst_argb
4382 mov ecx, [esp + 12] // width
4383 sub edx, eax
4384 vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
4385
4386 convertloop:
4387 vmovdqu ymm6, [eax] // read 8 pixels.
4388 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
4389 vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
4390 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4391 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4392 vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
4393 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
4394 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
4395 vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
4396 vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
4397 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
4398 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
4399 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4400 vmovdqu [eax + edx], ymm0
4401 lea eax, [eax + 32]
4402 sub ecx, 8
4403 jg convertloop
4404
4405 vzeroupper
4406 ret
4407 }
4408 }
4409 #else // USE_GATHER
4410 __declspec(naked)
4411 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4412 int width) {
4413 __asm {
4414
4415 mov eax, [esp + 4] // src_argb0
4416 mov edx, [esp + 8] // dst_argb
4417 mov ecx, [esp + 12] // width
4418 sub edx, eax
4419 vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
4420
4421 push esi
4422 push edi
4423
4424 convertloop:
4425 // replace VPGATHER
4426 movzx esi, byte ptr [eax + 3] // alpha0
4427 movzx edi, byte ptr [eax + 7] // alpha1
4428 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
4429 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
4430 movzx esi, byte ptr [eax + 11] // alpha2
4431 movzx edi, byte ptr [eax + 15] // alpha3
4432 vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
4433 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
4434 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
4435 movzx esi, byte ptr [eax + 19] // alpha4
4436 movzx edi, byte ptr [eax + 23] // alpha5
4437 vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
4438 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
4439 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
4440 movzx esi, byte ptr [eax + 27] // alpha6
4441 movzx edi, byte ptr [eax + 31] // alpha7
4442 vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
4443 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
4444 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
4445 vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
4446 vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
4447 vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
4448 vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4449 // end of VPGATHER
4450
4451 vmovdqu ymm6, [eax] // read 8 pixels.
4452 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4453 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4454 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
4455 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
4456 vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
4457 vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
4458 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
4459 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
4460 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4461 vmovdqu [eax + edx], ymm0
4462 lea eax, [eax + 32]
4463 sub ecx, 8
4464 jg convertloop
4465
4466 pop edi
4467 pop esi
4468 vzeroupper
4469 ret
4470 }
4471 }
4472 #endif // USE_GATHER
4473 #endif // HAS_ARGBUNATTENUATEROW_AVX2
4474
4475 #ifdef HAS_ARGBGRAYROW_SSSE3
4476 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
4477 __declspec(naked)
4478 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4479 __asm {
4480 mov eax, [esp + 4] /* src_argb */
4481 mov edx, [esp + 8] /* dst_argb */
4482 mov ecx, [esp + 12] /* width */
4483 movdqa xmm4, kARGBToYJ
4484 movdqa xmm5, kAddYJ64
4485
4486 convertloop:
4487 movdqu xmm0, [eax] // G
4488 movdqu xmm1, [eax + 16]
4489 pmaddubsw xmm0, xmm4
4490 pmaddubsw xmm1, xmm4
4491 phaddw xmm0, xmm1
4492 paddw xmm0, xmm5 // Add .5 for rounding.
4493 psrlw xmm0, 7
4494 packuswb xmm0, xmm0 // 8 G bytes
4495 movdqu xmm2, [eax] // A
4496 movdqu xmm3, [eax + 16]
4497 lea eax, [eax + 32]
4498 psrld xmm2, 24
4499 psrld xmm3, 24
4500 packuswb xmm2, xmm3
4501 packuswb xmm2, xmm2 // 8 A bytes
4502 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
4503 punpcklbw xmm0, xmm0 // 8 GG words
4504 punpcklbw xmm3, xmm2 // 8 GA words
4505 movdqa xmm1, xmm0
4506 punpcklwd xmm0, xmm3 // GGGA first 4
4507 punpckhwd xmm1, xmm3 // GGGA next 4
4508 movdqu [edx], xmm0
4509 movdqu [edx + 16], xmm1
4510 lea edx, [edx + 32]
4511 sub ecx, 8
4512 jg convertloop
4513 ret
4514 }
4515 }
4516 #endif // HAS_ARGBGRAYROW_SSSE3
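
// Scalar sketch of the gray conversion (editor-added illustration; the
// weights below are an assumption standing in for kARGBToYJ, which is defined
// earlier in this file; any B/G/R weights summing to 128 match the >> 7):
static void ARGBGrayRow_Reference(const uint8* src_argb, uint8* dst_argb,
                                  int width) {
  enum { kWB = 15, kWG = 75, kWR = 38 };  // assumed JPEG-style luma weights.
  int x;
  for (x = 0; x < width; ++x) {
    const uint8* p = src_argb + x * 4;
    int y = (p[0] * kWB + p[1] * kWG + p[2] * kWR + 64) >> 7;  // +64 rounds.
    dst_argb[x * 4 + 0] = (uint8)y;  // B
    dst_argb[x * 4 + 1] = (uint8)y;  // G
    dst_argb[x * 4 + 2] = (uint8)y;  // R
    dst_argb[x * 4 + 3] = p[3];      // original alpha preserved.
  }
}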
4517
4518 #ifdef HAS_ARGBSEPIAROW_SSSE3
4519 // b = (r * 35 + g * 68 + b * 17) >> 7
4520 // g = (r * 45 + g * 88 + b * 22) >> 7
4521 // r = (r * 50 + g * 98 + b * 24) >> 7
4522 // Constant for ARGB color to sepia tone.
4523 static const vec8 kARGBToSepiaB = {
4524 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
4525 };
4526
4527 static const vec8 kARGBToSepiaG = {
4528 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
4529 };
4530
4531 static const vec8 kARGBToSepiaR = {
4532 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
4533 };
4534
4535 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
4536 __declspec(naked)
4537 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
4538 __asm {
4539 mov eax, [esp + 4] /* dst_argb */
4540 mov ecx, [esp + 8] /* width */
4541 movdqa xmm2, kARGBToSepiaB
4542 movdqa xmm3, kARGBToSepiaG
4543 movdqa xmm4, kARGBToSepiaR
4544
4545 convertloop:
4546 movdqu xmm0, [eax] // B
4547 movdqu xmm6, [eax + 16]
4548 pmaddubsw xmm0, xmm2
4549 pmaddubsw xmm6, xmm2
4550 phaddw xmm0, xmm6
4551 psrlw xmm0, 7
4552 packuswb xmm0, xmm0 // 8 B values
4553 movdqu xmm5, [eax] // G
4554 movdqu xmm1, [eax + 16]
4555 pmaddubsw xmm5, xmm3
4556 pmaddubsw xmm1, xmm3
4557 phaddw xmm5, xmm1
4558 psrlw xmm5, 7
4559 packuswb xmm5, xmm5 // 8 G values
4560 punpcklbw xmm0, xmm5 // 8 BG values
4561 movdqu xmm5, [eax] // R
4562 movdqu xmm1, [eax + 16]
4563 pmaddubsw xmm5, xmm4
4564 pmaddubsw xmm1, xmm4
4565 phaddw xmm5, xmm1
4566 psrlw xmm5, 7
4567 packuswb xmm5, xmm5 // 8 R values
4568 movdqu xmm6, [eax] // A
4569 movdqu xmm1, [eax + 16]
4570 psrld xmm6, 24
4571 psrld xmm1, 24
4572 packuswb xmm6, xmm1
4573 packuswb xmm6, xmm6 // 8 A values
4574 punpcklbw xmm5, xmm6 // 8 RA values
4575 movdqa xmm1, xmm0 // Weave BG, RA together
4576 punpcklwd xmm0, xmm5 // BGRA first 4
4577 punpckhwd xmm1, xmm5 // BGRA next 4
4578 movdqu [eax], xmm0
4579 movdqu [eax + 16], xmm1
4580 lea eax, [eax + 32]
4581 sub ecx, 8
4582 jg convertloop
4583 ret
4584 }
4585 }
4586 #endif // HAS_ARGBSEPIAROW_SSSE3
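
// Scalar sketch of the sepia transform, written straight from the B/G/R
// equations in the comment above (editor-added illustration; results
// saturate to 255 exactly as packuswb does):
static void ARGBSepiaRow_Reference(uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8* p = dst_argb + x * 4;
    int b = p[0], g = p[1], r = p[2];
    int nb = (r * 35 + g * 68 + b * 17) >> 7;
    int ng = (r * 45 + g * 88 + b * 22) >> 7;
    int nr = (r * 50 + g * 98 + b * 24) >> 7;
    p[0] = (uint8)(nb > 255 ? 255 : nb);
    p[1] = (uint8)(ng > 255 ? 255 : ng);
    p[2] = (uint8)(nr > 255 ? 255 : nr);  // p[3] (alpha) is left unchanged.
  }
}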
4587
4588 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4589 // Transform 8 ARGB pixels (32 bytes) with color matrix.
4590 // Same as Sepia except matrix is provided.
4591 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
4592 // and B into a high and low, then G/A, unpcklbw/unpckhbw and then unpcklwd/unpckhwd.
4593 __declspec(naked)
4594 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
4595 const int8* matrix_argb, int width) {
4596 __asm {
4597 mov eax, [esp + 4] /* src_argb */
4598 mov edx, [esp + 8] /* dst_argb */
4599 mov ecx, [esp + 12] /* matrix_argb */
4600 movdqu xmm5, [ecx]
4601 pshufd xmm2, xmm5, 0x00
4602 pshufd xmm3, xmm5, 0x55
4603 pshufd xmm4, xmm5, 0xaa
4604 pshufd xmm5, xmm5, 0xff
4605 mov ecx, [esp + 16] /* width */
4606
4607 convertloop:
4608 movdqu xmm0, [eax] // B
4609 movdqu xmm7, [eax + 16]
4610 pmaddubsw xmm0, xmm2
4611 pmaddubsw xmm7, xmm2
4612 movdqu xmm6, [eax] // G
4613 movdqu xmm1, [eax + 16]
4614 pmaddubsw xmm6, xmm3
4615 pmaddubsw xmm1, xmm3
4616 phaddsw xmm0, xmm7 // B
4617 phaddsw xmm6, xmm1 // G
4618 psraw xmm0, 6 // B
4619 psraw xmm6, 6 // G
4620 packuswb xmm0, xmm0 // 8 B values
4621 packuswb xmm6, xmm6 // 8 G values
4622 punpcklbw xmm0, xmm6 // 8 BG values
4623 movdqu xmm1, [eax] // R
4624 movdqu xmm7, [eax + 16]
4625 pmaddubsw xmm1, xmm4
4626 pmaddubsw xmm7, xmm4
4627 phaddsw xmm1, xmm7 // R
4628 movdqu xmm6, [eax] // A
4629 movdqu xmm7, [eax + 16]
4630 pmaddubsw xmm6, xmm5
4631 pmaddubsw xmm7, xmm5
4632 phaddsw xmm6, xmm7 // A
4633 psraw xmm1, 6 // R
4634 psraw xmm6, 6 // A
4635 packuswb xmm1, xmm1 // 8 R values
4636 packuswb xmm6, xmm6 // 8 A values
4637 punpcklbw xmm1, xmm6 // 8 RA values
4638 movdqa xmm6, xmm0 // Weave BG, RA together
4639 punpcklwd xmm0, xmm1 // BGRA first 4
4640 punpckhwd xmm6, xmm1 // BGRA next 4
4641 movdqu [edx], xmm0
4642 movdqu [edx + 16], xmm6
4643 lea eax, [eax + 32]
4644 lea edx, [edx + 32]
4645 sub ecx, 8
4646 jg convertloop
4647 ret
4648 }
4649 }
4650 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
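
// Scalar sketch of the color matrix path (editor-added illustration): each
// output channel is a signed dot product of the source B,G,R,A bytes with one
// row of four int8 coefficients, shifted right by 6 and saturated, mirroring
// the pmaddubsw/phaddsw/psraw sequence above.
static void ARGBColorMatrixRow_Reference(const uint8* src_argb,
                                         uint8* dst_argb,
                                         const int8* matrix_argb, int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    const uint8* p = src_argb + x * 4;
    for (c = 0; c < 4; ++c) {
      const int8* m = matrix_argb + c * 4;
      int v = (p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3]) >> 6;
      dst_argb[x * 4 + c] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}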
4651
4652 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4653 // Quantize 4 ARGB pixels (16 bytes).
4654 __declspec(naked)
4655 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
4656 int interval_offset, int width) {
4657 __asm {
4658 mov eax, [esp + 4] /* dst_argb */
4659 movd xmm2, [esp + 8] /* scale */
4660 movd xmm3, [esp + 12] /* interval_size */
4661 movd xmm4, [esp + 16] /* interval_offset */
4662 mov ecx, [esp + 20] /* width */
4663 pshuflw xmm2, xmm2, 040h
4664 pshufd xmm2, xmm2, 044h
4665 pshuflw xmm3, xmm3, 040h
4666 pshufd xmm3, xmm3, 044h
4667 pshuflw xmm4, xmm4, 040h
4668 pshufd xmm4, xmm4, 044h
4669 pxor xmm5, xmm5 // constant 0
4670 pcmpeqb xmm6, xmm6 // generate mask 0xff000000
4671 pslld xmm6, 24
4672
4673 convertloop:
4674 movdqu xmm0, [eax] // read 4 pixels
4675 punpcklbw xmm0, xmm5 // first 2 pixels
4676 pmulhuw xmm0, xmm2 // pixel * scale >> 16
4677 movdqu xmm1, [eax] // read 4 pixels
4678 punpckhbw xmm1, xmm5 // next 2 pixels
4679 pmulhuw xmm1, xmm2
4680 pmullw xmm0, xmm3 // * interval_size
4681 movdqu xmm7, [eax] // read 4 pixels
4682 pmullw xmm1, xmm3
4683 pand xmm7, xmm6 // mask alpha
4684 paddw xmm0, xmm4 // + interval_offset
4685 paddw xmm1, xmm4
4686 packuswb xmm0, xmm1
4687 por xmm0, xmm7
4688 movdqu [eax], xmm0
4689 lea eax, [eax + 16]
4690 sub ecx, 4
4691 jg convertloop
4692 ret
4693 }
4694 }
4695 #endif // HAS_ARGBQUANTIZEROW_SSE2
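
// Scalar sketch of quantization (editor-added illustration): scale acts as a
// 0.16 fixed-point factor (only its low 16 bits are broadcast above), so each
// channel becomes ((c * scale) >> 16) * interval_size + interval_offset,
// saturated; alpha is masked off and preserved.
static void ARGBQuantizeRow_Reference(uint8* dst_argb, int scale,
                                      int interval_size, int interval_offset,
                                      int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 3; ++c) {
      int v = dst_argb[x * 4 + c];
      int q = ((v * (scale & 0xffff)) >> 16) * interval_size + interval_offset;
      dst_argb[x * 4 + c] = (uint8)(q > 255 ? 255 : q);
    }
  }
}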
4696
4697 #ifdef HAS_ARGBSHADEROW_SSE2
4698 // Shade 4 pixels at a time by specified value.
4699 __declspec(naked)
4700 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
4701 uint32 value) {
4702 __asm {
4703 mov eax, [esp + 4] // src_argb
4704 mov edx, [esp + 8] // dst_argb
4705 mov ecx, [esp + 12] // width
4706 movd xmm2, [esp + 16] // value
4707 punpcklbw xmm2, xmm2
4708 punpcklqdq xmm2, xmm2
4709
4710 convertloop:
4711 movdqu xmm0, [eax] // read 4 pixels
4712 lea eax, [eax + 16]
4713 movdqa xmm1, xmm0
4714 punpcklbw xmm0, xmm0 // first 2
4715 punpckhbw xmm1, xmm1 // next 2
4716 pmulhuw xmm0, xmm2 // argb * value
4717 pmulhuw xmm1, xmm2 // argb * value
4718 psrlw xmm0, 8
4719 psrlw xmm1, 8
4720 packuswb xmm0, xmm1
4721 movdqu [edx], xmm0
4722 lea edx, [edx + 16]
4723 sub ecx, 4
4724 jg convertloop
4725
4726 ret
4727 }
4728 }
4729 #endif // HAS_ARGBSHADEROW_SSE2
4730
4731 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4732 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
4733 __declspec(naked)
4734 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4735 uint8* dst_argb, int width) {
4736 __asm {
4737 push esi
4738 mov eax, [esp + 4 + 4] // src_argb0
4739 mov esi, [esp + 4 + 8] // src_argb1
4740 mov edx, [esp + 4 + 12] // dst_argb
4741 mov ecx, [esp + 4 + 16] // width
4742 pxor xmm5, xmm5 // constant 0
4743
4744 convertloop:
4745 movdqu xmm0, [eax] // read 4 pixels from src_argb0
4746 movdqu xmm2, [esi] // read 4 pixels from src_argb1
4747 movdqu xmm1, xmm0
4748 movdqu xmm3, xmm2
4749 punpcklbw xmm0, xmm0 // first 2
4750 punpckhbw xmm1, xmm1 // next 2
4751 punpcklbw xmm2, xmm5 // first 2
4752 punpckhbw xmm3, xmm5 // next 2
4753 pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
4754 pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
4755 lea eax, [eax + 16]
4756 lea esi, [esi + 16]
4757 packuswb xmm0, xmm1
4758 movdqu [edx], xmm0
4759 lea edx, [edx + 16]
4760 sub ecx, 4
4761 jg convertloop
4762
4763 pop esi
4764 ret
4765 }
4766 }
4767 #endif // HAS_ARGBMULTIPLYROW_SSE2
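
// Scalar sketch of the multiply blend (editor-added illustration): pmulhuw of
// a byte replicated into a word (c0 * 257) against a zero-extended byte gives
// (c0 * 257 * c1) >> 16, which stays within one step of c0 * c1 / 255:
static void ARGBMultiplyRow_Reference(const uint8* src_argb0,
                                      const uint8* src_argb1,
                                      uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    dst_argb[i] = (uint8)((src_argb0[i] * 257 * src_argb1[i]) >> 16);
  }
}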
4768
4769 #ifdef HAS_ARGBADDROW_SSE2
4770 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
4771 // TODO(fbarchard): Port this to posix, neon and other math functions.
4772 __declspec(naked)
4773 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4774 uint8* dst_argb, int width) {
4775 __asm {
4776 push esi
4777 mov eax, [esp + 4 + 4] // src_argb0
4778 mov esi, [esp + 4 + 8] // src_argb1
4779 mov edx, [esp + 4 + 12] // dst_argb
4780 mov ecx, [esp + 4 + 16] // width
4781
4782 sub ecx, 4
4783 jl convertloop49
4784
4785 convertloop4:
4786 movdqu xmm0, [eax] // read 4 pixels from src_argb0
4787 lea eax, [eax + 16]
4788 movdqu xmm1, [esi] // read 4 pixels from src_argb1
4789 lea esi, [esi + 16]
4790 paddusb xmm0, xmm1 // src_argb0 + src_argb1
4791 movdqu [edx], xmm0
4792 lea edx, [edx + 16]
4793 sub ecx, 4
4794 jge convertloop4
4795
4796 convertloop49:
4797 add ecx, 4 - 1
4798 jl convertloop19
4799
4800 convertloop1:
4801 movd xmm0, [eax] // read 1 pixel from src_argb0
4802 lea eax, [eax + 4]
4803 movd xmm1, [esi] // read 1 pixel from src_argb1
4804 lea esi, [esi + 4]
4805 paddusb xmm0, xmm1 // src_argb0 + src_argb1
4806 movd [edx], xmm0
4807 lea edx, [edx + 4]
4808 sub ecx, 1
4809 jge convertloop1
4810
4811 convertloop19:
4812 pop esi
4813 ret
4814 }
4815 }
4816 #endif // HAS_ARGBADDROW_SSE2
4817
4818 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4819 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
4820 __declspec(naked)
4821 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4822 uint8* dst_argb, int width) {
4823 __asm {
4824 push esi
4825 mov eax, [esp + 4 + 4] // src_argb0
4826 mov esi, [esp + 4 + 8] // src_argb1
4827 mov edx, [esp + 4 + 12] // dst_argb
4828 mov ecx, [esp + 4 + 16] // width
4829
4830 convertloop:
4831 movdqu xmm0, [eax] // read 4 pixels from src_argb0
4832 lea eax, [eax + 16]
4833 movdqu xmm1, [esi] // read 4 pixels from src_argb1
4834 lea esi, [esi + 16]
4835 psubusb xmm0, xmm1 // src_argb0 - src_argb1
4836 movdqu [edx], xmm0
4837 lea edx, [edx + 16]
4838 sub ecx, 4
4839 jg convertloop
4840
4841 pop esi
4842 ret
4843 }
4844 }
4845 #endif // HAS_ARGBSUBTRACTROW_SSE2
4846
4847 #ifdef HAS_ARGBMULTIPLYROW_AVX2
4848 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
4849 __declspec(naked)
4850 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4851 uint8* dst_argb, int width) {
4852 __asm {
4853 push esi
4854 mov eax, [esp + 4 + 4] // src_argb0
4855 mov esi, [esp + 4 + 8] // src_argb1
4856 mov edx, [esp + 4 + 12] // dst_argb
4857 mov ecx, [esp + 4 + 16] // width
4858 vpxor ymm5, ymm5, ymm5 // constant 0
4859
4860 convertloop:
4861 vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
4862 lea eax, [eax + 32]
4863 vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
4864 lea esi, [esi + 32]
4865 vpunpcklbw ymm0, ymm1, ymm1 // low 4
4866 vpunpckhbw ymm1, ymm1, ymm1 // high 4
4867 vpunpcklbw ymm2, ymm3, ymm5 // low 4
4868 vpunpckhbw ymm3, ymm3, ymm5 // high 4
4869 vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
4870 vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
4871 vpackuswb ymm0, ymm0, ymm1
4872 vmovdqu [edx], ymm0
4873 lea edx, [edx + 32]
4874 sub ecx, 8
4875 jg convertloop
4876
4877 pop esi
4878 vzeroupper
4879 ret
4880 }
4881 }
4882 #endif // HAS_ARGBMULTIPLYROW_AVX2
4883
4884 #ifdef HAS_ARGBADDROW_AVX2
4885 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
4886 __declspec(naked)
4887 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4888 uint8* dst_argb, int width) {
4889 __asm {
4890 push esi
4891 mov eax, [esp + 4 + 4] // src_argb0
4892 mov esi, [esp + 4 + 8] // src_argb1
4893 mov edx, [esp + 4 + 12] // dst_argb
4894 mov ecx, [esp + 4 + 16] // width
4895
4896 convertloop:
4897 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
4898 lea eax, [eax + 32]
4899 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
4900 lea esi, [esi + 32]
4901 vmovdqu [edx], ymm0
4902 lea edx, [edx + 32]
4903 sub ecx, 8
4904 jg convertloop
4905
4906 pop esi
4907 vzeroupper
4908 ret
4909 }
4910 }
4911 #endif // HAS_ARGBADDROW_AVX2
4912
4913 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4914 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
4915 __declspec(naked)
4916 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
4917 uint8* dst_argb, int width) {
4918 __asm {
4919 push esi
4920 mov eax, [esp + 4 + 4] // src_argb0
4921 mov esi, [esp + 4 + 8] // src_argb1
4922 mov edx, [esp + 4 + 12] // dst_argb
4923 mov ecx, [esp + 4 + 16] // width
4924
4925 convertloop:
4926 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
4927 lea eax, [eax + 32]
4928 vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
4929 lea esi, [esi + 32]
4930 vmovdqu [edx], ymm0
4931 lea edx, [edx + 32]
4932 sub ecx, 8
4933 jg convertloop
4934
4935 pop esi
4936 vzeroupper
4937 ret
4938 }
4939 }
4940 #endif // HAS_ARGBSUBTRACTROW_AVX2
4941
4942 #ifdef HAS_SOBELXROW_SSE2
4943 // SobelX as a matrix is
4944 // -1 0 1
4945 // -2 0 2
4946 // -1 0 1
4947 __declspec(naked)
4948 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
4949 const uint8* src_y2, uint8* dst_sobelx, int width) {
4950 __asm {
4951 push esi
4952 push edi
4953 mov eax, [esp + 8 + 4] // src_y0
4954 mov esi, [esp + 8 + 8] // src_y1
4955 mov edi, [esp + 8 + 12] // src_y2
4956 mov edx, [esp + 8 + 16] // dst_sobelx
4957 mov ecx, [esp + 8 + 20] // width
4958 sub esi, eax
4959 sub edi, eax
4960 sub edx, eax
4961 pxor xmm5, xmm5 // constant 0
4962
4963 convertloop:
4964 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
4965 movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
4966 punpcklbw xmm0, xmm5
4967 punpcklbw xmm1, xmm5
4968 psubw xmm0, xmm1
4969 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
4970 movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
4971 punpcklbw xmm1, xmm5
4972 punpcklbw xmm2, xmm5
4973 psubw xmm1, xmm2
4974 movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
4975 movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
4976 punpcklbw xmm2, xmm5
4977 punpcklbw xmm3, xmm5
4978 psubw xmm2, xmm3
4979 paddw xmm0, xmm2
4980 paddw xmm0, xmm1
4981 paddw xmm0, xmm1
4982 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
4983 psubw xmm1, xmm0
4984 pmaxsw xmm0, xmm1
4985 packuswb xmm0, xmm0
4986 movq qword ptr [eax + edx], xmm0
4987 lea eax, [eax + 8]
4988 sub ecx, 8
4989 jg convertloop
4990
4991 pop edi
4992 pop esi
4993 ret
4994 }
4995 }
4996 #endif // HAS_SOBELXROW_SSE2
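
// Scalar sketch of SobelX (editor-added illustration), applying the 3x3
// kernel from the comment above one output byte at a time; the sign flips
// under the absolute value, so [0] - [2] matches the -1 0 1 columns:
static void SobelXRow_Reference(const uint8* src_y0, const uint8* src_y1,
                                const uint8* src_y2, uint8* dst_sobelx,
                                int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = (src_y0[i] - src_y0[i + 2]) + 2 * (src_y1[i] - src_y1[i + 2]) +
            (src_y2[i] - src_y2[i + 2]);
    if (s < 0) s = -s;  // abs via pmaxsw in the SIMD loop.
    dst_sobelx[i] = (uint8)(s > 255 ? 255 : s);  // packuswb saturation.
  }
}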
4997
4998 #ifdef HAS_SOBELYROW_SSE2
4999 // SobelY as a matrix is
5000 // -1 -2 -1
5001 // 0 0 0
5002 // 1 2 1
5003 __declspec(naked)
5004 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5005 uint8* dst_sobely, int width) {
5006 __asm {
5007 push esi
5008 mov eax, [esp + 4 + 4] // src_y0
5009 mov esi, [esp + 4 + 8] // src_y1
5010 mov edx, [esp + 4 + 12] // dst_sobely
5011 mov ecx, [esp + 4 + 16] // width
5012 sub esi, eax
5013 sub edx, eax
5014 pxor xmm5, xmm5 // constant 0
5015
5016 convertloop:
5017 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
5018 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5019 punpcklbw xmm0, xmm5
5020 punpcklbw xmm1, xmm5
5021 psubw xmm0, xmm1
5022 movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
5023 movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
5024 punpcklbw xmm1, xmm5
5025 punpcklbw xmm2, xmm5
5026 psubw xmm1, xmm2
5027 movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
5028 movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5029 punpcklbw xmm2, xmm5
5030 punpcklbw xmm3, xmm5
5031 psubw xmm2, xmm3
5032 paddw xmm0, xmm2
5033 paddw xmm0, xmm1
5034 paddw xmm0, xmm1
5035 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5036 psubw xmm1, xmm0
5037 pmaxsw xmm0, xmm1
5038 packuswb xmm0, xmm0
5039 movq qword ptr [eax + edx], xmm0
5040 lea eax, [eax + 8]
5041 sub ecx, 8
5042 jg convertloop
5043
5044 pop esi
5045 ret
5046 }
5047 }
5048 #endif // HAS_SOBELYROW_SSE2
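
// Matching scalar sketch for SobelY (editor-added illustration), taking the
// row difference with horizontal taps 1, 2, 1 per the matrix above:
static void SobelYRow_Reference(const uint8* src_y0, const uint8* src_y1,
                                uint8* dst_sobely, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = (src_y0[i] - src_y1[i]) + 2 * (src_y0[i + 1] - src_y1[i + 1]) +
            (src_y0[i + 2] - src_y1[i + 2]);
    if (s < 0) s = -s;
    dst_sobely[i] = (uint8)(s > 255 ? 255 : s);
  }
}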
5049
5050 #ifdef HAS_SOBELROW_SSE2
5051 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5052 // A = 255
5053 // R = Sobel
5054 // G = Sobel
5055 // B = Sobel
5056 __declspec(naked)
5057 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5058 uint8* dst_argb, int width) {
5059 __asm {
5060 push esi
5061 mov eax, [esp + 4 + 4] // src_sobelx
5062 mov esi, [esp + 4 + 8] // src_sobely
5063 mov edx, [esp + 4 + 12] // dst_argb
5064 mov ecx, [esp + 4 + 16] // width
5065 sub esi, eax
5066 pcmpeqb xmm5, xmm5 // alpha 255
5067 pslld xmm5, 24 // 0xff000000
5068
5069 convertloop:
5070 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5071 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5072 lea eax, [eax + 16]
5073 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5074 movdqa xmm2, xmm0 // GG
5075 punpcklbw xmm2, xmm0 // First 8
5076 punpckhbw xmm0, xmm0 // Next 8
5077 movdqa xmm1, xmm2 // GGGG
5078 punpcklwd xmm1, xmm2 // First 4
5079 punpckhwd xmm2, xmm2 // Next 4
5080 por xmm1, xmm5 // GGGA
5081 por xmm2, xmm5
5082 movdqa xmm3, xmm0 // GGGG
5083 punpcklwd xmm3, xmm0 // Next 4
5084 punpckhwd xmm0, xmm0 // Last 4
5085 por xmm3, xmm5 // GGGA
5086 por xmm0, xmm5
5087 movdqu [edx], xmm1
5088 movdqu [edx + 16], xmm2
5089 movdqu [edx + 32], xmm3
5090 movdqu [edx + 48], xmm0
5091 lea edx, [edx + 64]
5092 sub ecx, 16
5093 jg convertloop
5094
5095 pop esi
5096 ret
5097 }
5098 }
5099 #endif // HAS_SOBELROW_SSE2
5100
5101 #ifdef HAS_SOBELTOPLANEROW_SSE2
5102 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
5103 __declspec(naked)
5104 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5105 uint8* dst_y, int width) {
5106 __asm {
5107 push esi
5108 mov eax, [esp + 4 + 4] // src_sobelx
5109 mov esi, [esp + 4 + 8] // src_sobely
5110 mov edx, [esp + 4 + 12] // dst_y
5111 mov ecx, [esp + 4 + 16] // width
5112 sub esi, eax
5113
5114 convertloop:
5115 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5116 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5117 lea eax, [eax + 16]
5118 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5119 movdqu [edx], xmm0
5120 lea edx, [edx + 16]
5121 sub ecx, 16
5122 jg convertloop
5123
5124 pop esi
5125 ret
5126 }
5127 }
5128 #endif // HAS_SOBELTOPLANEROW_SSE2
5129
5130 #ifdef HAS_SOBELXYROW_SSE2
5131 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5132 // A = 255
5133 // R = Sobel X
5134 // G = Sobel
5135 // B = Sobel Y
5136 __declspec(naked)
5137 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5138 uint8* dst_argb, int width) {
5139 __asm {
5140 push esi
5141 mov eax, [esp + 4 + 4] // src_sobelx
5142 mov esi, [esp + 4 + 8] // src_sobely
5143 mov edx, [esp + 4 + 12] // dst_argb
5144 mov ecx, [esp + 4 + 16] // width
5145 sub esi, eax
5146 pcmpeqb xmm5, xmm5 // alpha 255
5147
5148 convertloop:
5149 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5150 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5151 lea eax, [eax + 16]
5152 movdqa xmm2, xmm0
5153 paddusb xmm2, xmm1 // sobel = sobelx + sobely
5154 movdqa xmm3, xmm0 // XA
5155 punpcklbw xmm3, xmm5
5156 punpckhbw xmm0, xmm5
5157 movdqa xmm4, xmm1 // YS
5158 punpcklbw xmm4, xmm2
5159 punpckhbw xmm1, xmm2
5160 movdqa xmm6, xmm4 // YSXA
5161 punpcklwd xmm6, xmm3 // First 4
5162 punpckhwd xmm4, xmm3 // Next 4
5163 movdqa xmm7, xmm1 // YSXA
5164 punpcklwd xmm7, xmm0 // Next 4
5165 punpckhwd xmm1, xmm0 // Last 4
5166 movdqu [edx], xmm6
5167 movdqu [edx + 16], xmm4
5168 movdqu [edx + 32], xmm7
5169 movdqu [edx + 48], xmm1
5170 lea edx, [edx + 64]
5171 sub ecx, 16
5172 jg convertloop
5173
5174 pop esi
5175 ret
5176 }
5177 }
5178 #endif // HAS_SOBELXYROW_SSE2
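
// Scalar sketch of the two ARGB packings above (editor-added illustration):
// SobelRow stores s = min(255, sx + sy) into B, G and R with A = 255, while
// SobelXYRow spreads the components across channels as shown here.
static void SobelXYRow_Reference(const uint8* src_sobelx,
                                 const uint8* src_sobely,
                                 uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    if (s > 255) s = 255;                 // paddusb saturation.
    dst_argb[i * 4 + 0] = src_sobely[i];  // B = Sobel Y.
    dst_argb[i * 4 + 1] = (uint8)s;       // G = Sobel.
    dst_argb[i * 4 + 2] = src_sobelx[i];  // R = Sobel X.
    dst_argb[i * 4 + 3] = 255;            // A = 255.
  }
}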
5179
5180 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5181 // Consider float CumulativeSum.
5182 // Consider calling CumulativeSum one row at time as needed.
5183 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5184 // Convert cumulative sum for an area to an average for 1 pixel.
5185 // topleft is pointer to top left of CumulativeSum buffer for area.
5186 // botleft is pointer to bottom left of CumulativeSum buffer.
5187 // width is offset from left to right of area in CumulativeSum buffer measured
5188 // in number of ints.
5189 // area is the number of pixels in the area being averaged.
5190 // dst points to pixel to store result to.
5191 // count is number of averaged pixels to produce.
5192 // Does 4 pixels at a time.
5193 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
5194 int width, int area, uint8* dst,
5195 int count) {
5196 __asm {
5197 mov eax, topleft // eax topleft
5198 mov esi, botleft // esi botleft
5199 mov edx, width
5200 movd xmm5, area
5201 mov edi, dst
5202 mov ecx, count
5203 cvtdq2ps xmm5, xmm5
5204 rcpss xmm4, xmm5 // 1.0f / area
5205 pshufd xmm4, xmm4, 0
5206 sub ecx, 4
5207 jl l4b
5208
5209 cmp area, 128 // 128 pixels will not overflow 15 bits.
5210 ja l4
5211
5212 pshufd xmm5, xmm5, 0 // area
5213 pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
5214 psrld xmm6, 16
5215 cvtdq2ps xmm6, xmm6
5216 addps xmm5, xmm6 // (65536.0 + area - 1)
5217 mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
5218 cvtps2dq xmm5, xmm5 // 0.16 fixed point
5219 packssdw xmm5, xmm5 // 16 bit shorts
5220
5221 // 4 pixel loop small blocks.
5222 s4:
5223 // top left
5224 movdqu xmm0, [eax]
5225 movdqu xmm1, [eax + 16]
5226 movdqu xmm2, [eax + 32]
5227 movdqu xmm3, [eax + 48]
5228
5229 // - top right
5230 psubd xmm0, [eax + edx * 4]
5231 psubd xmm1, [eax + edx * 4 + 16]
5232 psubd xmm2, [eax + edx * 4 + 32]
5233 psubd xmm3, [eax + edx * 4 + 48]
5234 lea eax, [eax + 64]
5235
5236 // - bottom left
5237 psubd xmm0, [esi]
5238 psubd xmm1, [esi + 16]
5239 psubd xmm2, [esi + 32]
5240 psubd xmm3, [esi + 48]
5241
5242 // + bottom right
5243 paddd xmm0, [esi + edx * 4]
5244 paddd xmm1, [esi + edx * 4 + 16]
5245 paddd xmm2, [esi + edx * 4 + 32]
5246 paddd xmm3, [esi + edx * 4 + 48]
5247 lea esi, [esi + 64]
5248
5249 packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
5250 packssdw xmm2, xmm3
5251
5252 pmulhuw xmm0, xmm5
5253 pmulhuw xmm2, xmm5
5254
5255 packuswb xmm0, xmm2
5256 movdqu [edi], xmm0
5257 lea edi, [edi + 16]
5258 sub ecx, 4
5259 jge s4
5260
5261 jmp l4b
5262
5263 // 4 pixel loop
5264 l4:
5265 // top left
5266 movdqu xmm0, [eax]
5267 movdqu xmm1, [eax + 16]
5268 movdqu xmm2, [eax + 32]
5269 movdqu xmm3, [eax + 48]
5270
5271 // - top right
5272 psubd xmm0, [eax + edx * 4]
5273 psubd xmm1, [eax + edx * 4 + 16]
5274 psubd xmm2, [eax + edx * 4 + 32]
5275 psubd xmm3, [eax + edx * 4 + 48]
5276 lea eax, [eax + 64]
5277
5278 // - bottom left
5279 psubd xmm0, [esi]
5280 psubd xmm1, [esi + 16]
5281 psubd xmm2, [esi + 32]
5282 psubd xmm3, [esi + 48]
5283
5284 // + bottom right
5285 paddd xmm0, [esi + edx * 4]
5286 paddd xmm1, [esi + edx * 4 + 16]
5287 paddd xmm2, [esi + edx * 4 + 32]
5288 paddd xmm3, [esi + edx * 4 + 48]
5289 lea esi, [esi + 64]
5290
5291 cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
5292 cvtdq2ps xmm1, xmm1
5293 mulps xmm0, xmm4
5294 mulps xmm1, xmm4
5295 cvtdq2ps xmm2, xmm2
5296 cvtdq2ps xmm3, xmm3
5297 mulps xmm2, xmm4
5298 mulps xmm3, xmm4
5299 cvtps2dq xmm0, xmm0
5300 cvtps2dq xmm1, xmm1
5301 cvtps2dq xmm2, xmm2
5302 cvtps2dq xmm3, xmm3
5303 packssdw xmm0, xmm1
5304 packssdw xmm2, xmm3
5305 packuswb xmm0, xmm2
5306 movdqu [edi], xmm0
5307 lea edi, [edi + 16]
5308 sub ecx, 4
5309 jge l4
5310
5311 l4b:
5312 add ecx, 4 - 1
5313 jl l1b
5314
5315 // 1 pixel loop
5316 l1:
5317 movdqu xmm0, [eax]
5318 psubd xmm0, [eax + edx * 4]
5319 lea eax, [eax + 16]
5320 psubd xmm0, [esi]
5321 paddd xmm0, [esi + edx * 4]
5322 lea esi, [esi + 16]
5323 cvtdq2ps xmm0, xmm0
5324 mulps xmm0, xmm4
5325 cvtps2dq xmm0, xmm0
5326 packssdw xmm0, xmm0
5327 packuswb xmm0, xmm0
5328 movd dword ptr [edi], xmm0
5329 lea edi, [edi + 4]
5330 sub ecx, 1
5331 jge l1
5332 l1b:
5333 }
5334 }
5335 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
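
// Scalar sketch of the integral-image average (editor-added illustration),
// following the parameter description above: the four corner sums give
// sum = tl - tr - bl + br, and each channel is divided by area. The SIMD
// paths use 0.16 fixed point or float reciprocals, so rounding can differ.
static void CumulativeSumToAverageRow_Reference(const int32* topleft,
                                                const int32* botleft,
                                                int width, int area,
                                                uint8* dst, int count) {
  int i, c;
  for (i = 0; i < count; ++i) {
    for (c = 0; c < 4; ++c) {
      int32 sum = topleft[i * 4 + c] - topleft[i * 4 + c + width] -
                  botleft[i * 4 + c] + botleft[i * 4 + c + width];
      dst[i * 4 + c] = (uint8)(sum / area);
    }
  }
}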
5336
5337 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5338 // Creates a table of cumulative sums where each value is a sum of all values
5339 // above and to the left of the value.
5340 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
5341 const int32* previous_cumsum, int width) {
5342 __asm {
5343 mov eax, row
5344 mov edx, cumsum
5345 mov esi, previous_cumsum
5346 mov ecx, width
5347 pxor xmm0, xmm0
5348 pxor xmm1, xmm1
5349
5350 sub ecx, 4
5351 jl l4b
5352 test edx, 15
5353 jne l4b
5354
5355 // 4 pixel loop
5356 l4:
5357 movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
5358 lea eax, [eax + 16]
5359 movdqa xmm4, xmm2
5360
5361 punpcklbw xmm2, xmm1
5362 movdqa xmm3, xmm2
5363 punpcklwd xmm2, xmm1
5364 punpckhwd xmm3, xmm1
5365
5366 punpckhbw xmm4, xmm1
5367 movdqa xmm5, xmm4
5368 punpcklwd xmm4, xmm1
5369 punpckhwd xmm5, xmm1
5370
5371 paddd xmm0, xmm2
5372 movdqu xmm2, [esi] // previous row above.
5373 paddd xmm2, xmm0
5374
5375 paddd xmm0, xmm3
5376 movdqu xmm3, [esi + 16]
5377 paddd xmm3, xmm0
5378
5379 paddd xmm0, xmm4
5380 movdqu xmm4, [esi + 32]
5381 paddd xmm4, xmm0
5382
5383 paddd xmm0, xmm5
5384 movdqu xmm5, [esi + 48]
5385 lea esi, [esi + 64]
5386 paddd xmm5, xmm0
5387
5388 movdqu [edx], xmm2
5389 movdqu [edx + 16], xmm3
5390 movdqu [edx + 32], xmm4
5391 movdqu [edx + 48], xmm5
5392
5393 lea edx, [edx + 64]
5394 sub ecx, 4
5395 jge l4
5396
5397 l4b:
5398 add ecx, 4 - 1
5399 jl l1b
5400
5401 // 1 pixel loop
5402 l1:
5403 movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
5404 lea eax, [eax + 4]
5405 punpcklbw xmm2, xmm1
5406 punpcklwd xmm2, xmm1
5407 paddd xmm0, xmm2
5408 movdqu xmm2, [esi]
5409 lea esi, [esi + 16]
5410 paddd xmm2, xmm0
5411 movdqu [edx], xmm2
5412 lea edx, [edx + 16]
5413 sub ecx, 1
5414 jge l1
5415
5416 l1b:
5417 }
5418 }
5419 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
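
// Scalar sketch of the cumulative-sum row (editor-added illustration): each
// output int is the running sum of this row's channel values so far plus the
// entry directly above in previous_cumsum, building the integral image
// row by row.
static void ComputeCumulativeSumRow_Reference(const uint8* row, int32* cumsum,
                                              const int32* previous_cumsum,
                                              int width) {
  int32 sum[4] = {0, 0, 0, 0};
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}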
5420
5421 #ifdef HAS_ARGBAFFINEROW_SSE2
5422 // Copy ARGB pixels from source image with slope to a row of destination.
5423 __declspec(naked)
5424 LIBYUV_API
5425 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
5426 uint8* dst_argb, const float* uv_dudv, int width) {
5427 __asm {
5428 push esi
5429 push edi
5430 mov eax, [esp + 12] // src_argb
5431 mov esi, [esp + 16] // stride
5432 mov edx, [esp + 20] // dst_argb
5433 mov ecx, [esp + 24] // pointer to uv_dudv
5434 movq xmm2, qword ptr [ecx] // uv
5435 movq xmm7, qword ptr [ecx + 8] // dudv
5436 mov ecx, [esp + 28] // width
5437 shl esi, 16 // esi = (stride << 16) | 4 for pmaddwd
5438 add esi, 4
5439 movd xmm5, esi
5440 sub ecx, 4
5441 jl l4b
5442
5443 // setup for 4 pixel loop
5444 pshufd xmm7, xmm7, 0x44 // dup dudv
5445 pshufd xmm5, xmm5, 0 // dup 4, stride
5446 movdqa xmm0, xmm2 // x0, y0, x1, y1
5447 addps xmm0, xmm7
5448 movlhps xmm2, xmm0
5449 movdqa xmm4, xmm7
5450 addps xmm4, xmm4 // dudv *= 2
5451 movdqa xmm3, xmm2 // x2, y2, x3, y3
5452 addps xmm3, xmm4
5453 addps xmm4, xmm4 // dudv *= 4
5454
5455 // 4 pixel loop
5456 l4:
5457 cvttps2dq xmm0, xmm2 // x, y float to int first 2
5458 cvttps2dq xmm1, xmm3 // x, y float to int next 2
5459 packssdw xmm0, xmm1 // x, y as 8 shorts
5460 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
5461 movd esi, xmm0
5462 pshufd xmm0, xmm0, 0x39 // shift right
5463 movd edi, xmm0
5464 pshufd xmm0, xmm0, 0x39 // shift right
5465 movd xmm1, [eax + esi] // read pixel 0
5466 movd xmm6, [eax + edi] // read pixel 1
5467 punpckldq xmm1, xmm6 // combine pixel 0 and 1
5468 addps xmm2, xmm4 // x, y += dx, dy first 2
5469 movq qword ptr [edx], xmm1
5470 movd esi, xmm0
5471 pshufd xmm0, xmm0, 0x39 // shift right
5472 movd edi, xmm0
5473 movd xmm6, [eax + esi] // read pixel 2
5474 movd xmm0, [eax + edi] // read pixel 3
5475 punpckldq xmm6, xmm0 // combine pixel 2 and 3
5476 addps xmm3, xmm4 // x, y += dx, dy next 2
5477 movq qword ptr [edx + 8], xmm6
5478 lea edx, [edx + 16]
5479 sub ecx, 4
5480 jge l4
5481
5482 l4b:
5483 add ecx, 4 - 1
5484 jl l1b
5485
5486 // 1 pixel loop
5487 l1:
5488 cvttps2dq xmm0, xmm2 // x, y float to int
5489 packssdw xmm0, xmm0 // x, y as shorts
5490 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
5491 addps xmm2, xmm7 // x, y += dx, dy
5492 movd esi, xmm0
5493 movd xmm0, [eax + esi] // copy a pixel
5494 movd [edx], xmm0
5495 lea edx, [edx + 4]
5496 sub ecx, 1
5497 jge l1
5498 l1b:
5499 pop edi
5500 pop esi
5501 ret
5502 }
5503 }
5504 #endif // HAS_ARGBAFFINEROW_SSE2
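
// Scalar sketch of the affine copy (editor-added illustration): uv_dudv holds
// the starting source coordinate (u, v) followed by its per-pixel step
// (du, dv); each destination pixel is fetched from the truncated coordinate,
// as cvttps2dq does above.
static void ARGBAffineRow_Reference(const uint8* src_argb, int src_argb_stride,
                                    uint8* dst_argb, const float* uv_dudv,
                                    int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int i, c;
  for (i = 0; i < width; ++i) {
    const uint8* p = src_argb + (int)v * src_argb_stride + (int)u * 4;
    for (c = 0; c < 4; ++c) {
      dst_argb[i * 4 + c] = p[c];
    }
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}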
5505
5506 #ifdef HAS_INTERPOLATEROW_AVX2
5507 // Bilinear filter 32x2 -> 32x1
5508 __declspec(naked)
5509 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
5510 ptrdiff_t src_stride, int dst_width,
5511 int source_y_fraction) {
5512 __asm {
5513 push esi
5514 push edi
5515 mov edi, [esp + 8 + 4] // dst_ptr
5516 mov esi, [esp + 8 + 8] // src_ptr
5517 mov edx, [esp + 8 + 12] // src_stride
5518 mov ecx, [esp + 8 + 16] // dst_width
5519 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5520 shr eax, 1
5521 // Dispatch to specialized filters if applicable.
5522 cmp eax, 0
5523 je xloop100 // 0 / 128. Blend 100 / 0.
5524 sub edi, esi
5525 cmp eax, 32
5526 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
5527 cmp eax, 64
5528 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
5529 cmp eax, 96
5530 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
5531
5532 vmovd xmm0, eax // high fraction 0..127
5533 neg eax
5534 add eax, 128
5535 vmovd xmm5, eax // low fraction 128..1
5536 vpunpcklbw xmm5, xmm5, xmm0
5537 vpunpcklwd xmm5, xmm5, xmm5
5538 vpxor ymm0, ymm0, ymm0
5539 vpermd ymm5, ymm0, ymm5
5540
5541 xloop:
5542 vmovdqu ymm0, [esi]
5543 vmovdqu ymm2, [esi + edx]
5544 vpunpckhbw ymm1, ymm0, ymm2 // mutates
5545 vpunpcklbw ymm0, ymm0, ymm2 // mutates
5546 vpmaddubsw ymm0, ymm0, ymm5
5547 vpmaddubsw ymm1, ymm1, ymm5
5548 vpsrlw ymm0, ymm0, 7
5549 vpsrlw ymm1, ymm1, 7
5550 vpackuswb ymm0, ymm0, ymm1 // unmutates
5551 vmovdqu [esi + edi], ymm0
5552 lea esi, [esi + 32]
5553 sub ecx, 32
5554 jg xloop
5555 jmp xloop99
5556
5557 // Blend 25 / 75.
5558 xloop25:
5559 vmovdqu ymm0, [esi]
5560 vmovdqu ymm1, [esi + edx]
5561 vpavgb ymm0, ymm0, ymm1
5562 vpavgb ymm0, ymm0, ymm1
5563 vmovdqu [esi + edi], ymm0
5564 lea esi, [esi + 32]
5565 sub ecx, 32
5566 jg xloop25
5567 jmp xloop99
5568
5569 // Blend 50 / 50.
5570 xloop50:
5571 vmovdqu ymm0, [esi]
5572 vpavgb ymm0, ymm0, [esi + edx]
5573 vmovdqu [esi + edi], ymm0
5574 lea esi, [esi + 32]
5575 sub ecx, 32
5576 jg xloop50
5577 jmp xloop99
5578
5579 // Blend 75 / 25.
5580 xloop75:
5581 vmovdqu ymm1, [esi]
5582 vmovdqu ymm0, [esi + edx]
5583 vpavgb ymm0, ymm0, ymm1
5584 vpavgb ymm0, ymm0, ymm1
5585 vmovdqu [esi + edi], ymm0
5586 lea esi, [esi + 32]
5587 sub ecx, 32
5588 jg xloop75
5589 jmp xloop99
5590
5591 // Blend 100 / 0 - Copy row unchanged.
5592 xloop100:
5593 rep movsb
5594
5595 xloop99:
5596 pop edi
5597 pop esi
5598 vzeroupper
5599 ret
5600 }
5601 }
5602 #endif // HAS_INTERPOLATEROW_AVX2
5603
5604 // Bilinear filter 16x2 -> 16x1
5605 __declspec(naked)
5606 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
5607 ptrdiff_t src_stride, int dst_width,
5608 int source_y_fraction) {
5609 __asm {
5610 push esi
5611 push edi
5612 mov edi, [esp + 8 + 4] // dst_ptr
5613 mov esi, [esp + 8 + 8] // src_ptr
5614 mov edx, [esp + 8 + 12] // src_stride
5615 mov ecx, [esp + 8 + 16] // dst_width
5616 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5617 sub edi, esi
5618 shr eax, 1
5619 // Dispatch to specialized filters if applicable.
5620 cmp eax, 0
5621 je xloop100 // 0 / 128. Blend 100 / 0.
5622 cmp eax, 32
5623 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
5624 cmp eax, 64
5625 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
5626 cmp eax, 96
5627 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
5628
5629 movd xmm0, eax // high fraction 0..127
5630 neg eax
5631 add eax, 128
5632 movd xmm5, eax // low fraction 128..1
5633 punpcklbw xmm5, xmm0
5634 punpcklwd xmm5, xmm5
5635 pshufd xmm5, xmm5, 0
5636
5637 xloop:
5638 movdqu xmm0, [esi]
5639 movdqu xmm2, [esi + edx]
5640 movdqu xmm1, xmm0
5641 punpcklbw xmm0, xmm2
5642 punpckhbw xmm1, xmm2
5643 pmaddubsw xmm0, xmm5
5644 pmaddubsw xmm1, xmm5
5645 psrlw xmm0, 7
5646 psrlw xmm1, 7
5647 packuswb xmm0, xmm1
5648 movdqu [esi + edi], xmm0
5649 lea esi, [esi + 16]
5650 sub ecx, 16
5651 jg xloop
5652 jmp xloop99
5653
5654 // Blend 25 / 75.
5655 xloop25:
5656 movdqu xmm0, [esi]
5657 movdqu xmm1, [esi + edx]
5658 pavgb xmm0, xmm1
5659 pavgb xmm0, xmm1
5660 movdqu [esi + edi], xmm0
5661 lea esi, [esi + 16]
5662 sub ecx, 16
5663 jg xloop25
5664 jmp xloop99
5665
5666 // Blend 50 / 50.
5667 xloop50:
5668 movdqu xmm0, [esi]
5669 movdqu xmm1, [esi + edx]
5670 pavgb xmm0, xmm1
5671 movdqu [esi + edi], xmm0
5672 lea esi, [esi + 16]
5673 sub ecx, 16
5674 jg xloop50
5675 jmp xloop99
5676
5677 // Blend 75 / 25.
5678 xloop75:
5679 movdqu xmm1, [esi]
5680 movdqu xmm0, [esi + edx]
5681 pavgb xmm0, xmm1
5682 pavgb xmm0, xmm1
5683 movdqu [esi + edi], xmm0
5684 lea esi, [esi + 16]
5685 sub ecx, 16
5686 jg xloop75
5687 jmp xloop99
5688
5689 // Blend 100 / 0 - Copy row unchanged.
5690 xloop100:
5691 movdqu xmm0, [esi]
5692 movdqu [esi + edi], xmm0
5693 lea esi, [esi + 16]
5694 sub ecx, 16
5695 jg xloop100
5696
5697 xloop99:
5698 pop edi
5699 pop esi
5700 ret
5701 }
5702 }

#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
__declspec(naked)
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256.  Blend 100 / 0.
    cmp        eax, 64
    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
    cmp        eax, 128
    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
    cmp        eax, 192
    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.

    movd       xmm5, eax  // xmm5 = y fraction
    punpcklbw  xmm5, xmm5
    psrlw      xmm5, 1
    punpcklwd  xmm5, xmm5
    punpckldq  xmm5, xmm5
    punpcklqdq xmm5, xmm5
    pxor       xmm4, xmm4

  xloop:
    movdqu     xmm0, [esi]        // row0
    movdqu     xmm2, [esi + edx]  // row1
    movdqu     xmm1, xmm0
    movdqu     xmm3, xmm2
    punpcklbw  xmm2, xmm4
    punpckhbw  xmm3, xmm4
    punpcklbw  xmm0, xmm4
    punpckhbw  xmm1, xmm4
    psubw      xmm2, xmm0  // row1 - row0
    psubw      xmm3, xmm1
    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
    paddw      xmm3, xmm3
    pmulhw     xmm2, xmm5  // scale diff
    pmulhw     xmm3, xmm5
    paddw      xmm0, xmm2  // sum rows
    paddw      xmm1, xmm3
    packuswb   xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop
    jmp        xloop99

    // Blend 25 / 75.
  xloop25:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop25
    jmp        xloop99

    // Blend 50 / 50.
  xloop50:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop50
    jmp        xloop99

    // Blend 75 / 25.
  xloop75:
    movdqu     xmm1, [esi]
    movdqu     xmm0, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop75
    jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    movdqu     xmm0, [esi]
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop100

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_INTERPOLATEROW_SSE2
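
// Editorial note: the SSE2 row above uses the algebraically equivalent form
// dst = row0 + (((row1 - row0) * f) >> 8), which needs only one multiply per
// lane: the difference is doubled and pmulhw'd against roughly f*128.5
// replicated into every 16-bit lane, so pmulhw's implicit >> 16 nets out to
// approximately the >> 8 above.  A scalar restatement (sketch only, same
// caveats as the sketch after InterpolateRow_AVX2):
static void InterpolateRow_DiffSketch(uint8* dst_ptr, const uint8* src_ptr,
                                      ptrdiff_t src_stride, int width,
                                      int source_y_fraction) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < width; ++x) {
    int diff = src_ptr1[x] - src_ptr[x];  // -255..255: fits 9 signed bits.
    dst_ptr[x] = (uint8)(src_ptr[x] + ((diff * source_y_fraction) >> 8));
  }
}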

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
__declspec(naked)
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // shuffler
    movdqu     xmm5, [ecx]
    mov        ecx, [esp + 16]  // pix

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop
    ret
  }
}
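
// Editorial note: scalar sketch of the per-pixel shuffle performed above.
// Each output byte of a 4-byte pixel is picked by the matching shuffler
// entry; the 16-byte pshufb mask simply repeats the 4-byte pattern with
// +4/+8/+12 offsets.  libyuv's portable version is ARGBShuffleRow_C in
// row_common.cc; this restatement is illustrative only.
static void ARGBShuffleRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  const uint8* shuffler, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_argb[0] = src_argb[shuffler[0]];  // shuffler entries are 0..3
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}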

#ifdef HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked)
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // shuffler
    vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
    mov        ecx, [esp + 16]  // pix

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpshufb    ymm0, ymm0, ymm5
    vpshufb    ymm1, ymm1, ymm5
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         wloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

__declspec(naked)
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  __asm {
    push       ebx
    push       esi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        edx, [esp + 8 + 8]   // dst_argb
    mov        esi, [esp + 8 + 12]  // shuffler
    mov        ecx, [esp + 8 + 16]  // pix
    pxor       xmm5, xmm5

    mov        ebx, [esi]  // shuffler
    cmp        ebx, 0x03000102
    je         shuf_3012
    cmp        ebx, 0x00010203
    je         shuf_0123
    cmp        ebx, 0x00030201
    je         shuf_0321
    cmp        ebx, 0x02010003
    je         shuf_2103

    // TODO(fbarchard): Use one source pointer and 3 offsets.
  shuf_any1:
    movzx      ebx, byte ptr [esi]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx], bl
    movzx      ebx, byte ptr [esi + 1]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 1], bl
    movzx      ebx, byte ptr [esi + 2]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 2], bl
    movzx      ebx, byte ptr [esi + 3]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 3], bl
    lea        eax, [eax + 4]
    lea        edx, [edx + 4]
    sub        ecx, 1
    jg         shuf_any1
    jmp        shuf99

  shuf_0123:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 01Bh  // 1B = 00011011 = 0x0123 = BGRAToARGB
    pshuflw    xmm0, xmm0, 01Bh
    pshufhw    xmm1, xmm1, 01Bh
    pshuflw    xmm1, xmm1, 01Bh
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_0123
    jmp        shuf99

  shuf_0321:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 039h  // 39 = 00111001 = 0x0321 = RGBAToARGB
    pshuflw    xmm0, xmm0, 039h
    pshufhw    xmm1, xmm1, 039h
    pshuflw    xmm1, xmm1, 039h
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_0321
    jmp        shuf99

  shuf_2103:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 093h  // 93 = 10010011 = 0x2103 = ARGBToRGBA
    pshuflw    xmm0, xmm0, 093h
    pshufhw    xmm1, xmm1, 093h
    pshuflw    xmm1, xmm1, 093h
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_2103
    jmp        shuf99

  shuf_3012:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 0C6h  // C6 = 11000110 = 0x3012 = ABGRToARGB
    pshuflw    xmm0, xmm0, 0C6h
    pshufhw    xmm1, xmm1, 0C6h
    pshuflw    xmm1, xmm1, 0C6h
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         shuf_3012

  shuf99:
    pop        esi
    pop        ebx
    ret
  }
}
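
// Editorial worked example for the dispatch above: the first four shuffler
// bytes are loaded as one little-endian dword.  A BGRA-to-ARGB mask that
// begins {3, 2, 1, 0, ...} therefore reads back as 0x00010203 and takes the
// shuf_0123 path; {1, 2, 3, 0, ...} (RGBA to ARGB) reads as 0x00030201 and
// takes shuf_0321.  Any unrecognized mask falls through to the
// byte-at-a-time shuf_any1 loop.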

// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....

// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1

__declspec(naked)
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_y
    mov        esi, [esp + 8 + 8]   // src_u
    mov        edx, [esp + 8 + 12]  // src_v
    mov        edi, [esp + 8 + 16]  // dst_frame
    mov        ecx, [esp + 8 + 20]  // width
    sub        edx, esi

  convertloop:
    movq       xmm2, qword ptr [esi]        // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3   // UV
    movdqu     xmm0, [eax]  // Y
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2   // YUYV
    punpckhbw  xmm1, xmm2
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm1
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
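
// Editorial note: scalar sketch of the packing above.  Two Y samples share
// one U and one V, emitted as Y0 U0 Y1 V0 per 4-byte macro-pixel.  libyuv's
// portable version is I422ToYUY2Row_C in row_common.cc; the odd-width tail
// is omitted here for brevity.
static void I422ToYUY2Row_Sketch(const uint8* src_y, const uint8* src_u,
                                 const uint8* src_v, uint8* dst_frame,
                                 int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_frame[0] = src_y[0];  // Y0
    dst_frame[1] = src_u[0];  // U shared by both pixels
    dst_frame[2] = src_y[1];  // Y1
    dst_frame[3] = src_v[0];  // V shared by both pixels
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}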

__declspec(naked)
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_y
    mov        esi, [esp + 8 + 8]   // src_u
    mov        edx, [esp + 8 + 12]  // src_v
    mov        edi, [esp + 8 + 16]  // dst_frame
    mov        ecx, [esp + 8 + 20]  // width
    sub        edx, esi

  convertloop:
    movq       xmm2, qword ptr [esi]        // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3   // UV
    movdqu     xmm0, [eax]  // Y
    movdqa     xmm1, xmm2
    lea        eax, [eax + 16]
    punpcklbw  xmm1, xmm0   // UYVY
    punpckhbw  xmm2, xmm0
    movdqu     [edi], xmm1
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
__declspec(naked)
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* src_argb */
    mov        edx, [esp + 4 + 8]   /* dst_argb */
    mov        esi, [esp + 4 + 12]  /* poly */
    mov        ecx, [esp + 4 + 16]  /* width */
    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.

    // 2 pixel loop.
  convertloop:
    // pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
    // pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
    movq       xmm0, qword ptr [eax]  // BGRABGRA
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm3
    movdqa     xmm4, xmm0
    punpcklwd  xmm0, xmm3  // pixel 0
    punpckhwd  xmm4, xmm3  // pixel 1
    cvtdq2ps   xmm0, xmm0  // 4 floats
    cvtdq2ps   xmm4, xmm4
    movdqa     xmm1, xmm0  // X
    movdqa     xmm5, xmm4
    mulps      xmm0, [esi + 16]  // C1 * X
    mulps      xmm4, [esi + 16]
    addps      xmm0, [esi]  // result = C0 + C1 * X
    addps      xmm4, [esi]
    movdqa     xmm2, xmm1
    movdqa     xmm6, xmm5
    mulps      xmm2, xmm1  // X * X
    mulps      xmm6, xmm5
    mulps      xmm1, xmm2  // X * X * X
    mulps      xmm5, xmm6
    mulps      xmm2, [esi + 32]  // C2 * X * X
    mulps      xmm6, [esi + 32]
    mulps      xmm1, [esi + 48]  // C3 * X * X * X
    mulps      xmm5, [esi + 48]
    addps      xmm0, xmm2  // result += C2 * X * X
    addps      xmm4, xmm6
    addps      xmm0, xmm1  // result += C3 * X * X * X
    addps      xmm4, xmm5
    cvttps2dq  xmm0, xmm0
    cvttps2dq  xmm4, xmm4
    packuswb   xmm0, xmm4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 2
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
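
// Editorial note: scalar sketch of the cubic applied above, channel by
// channel: v' = clamp(C0 + C1*v + C2*v^2 + C3*v^3).  poly holds four 4-float
// coefficient vectors (C0..C3), one float per B,G,R,A channel, which is why
// the rows load coefficients from byte offsets 0/16/32/48.  libyuv's
// portable version is ARGBPolynomialRow_C in row_common.cc; this restatement
// is illustrative only.
static void ARGBPolynomialRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                     const float* poly, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int c = i & 3;  // channel selects the column of the coefficient table
    float x = (float)src_argb[i];
    float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
              poly[c + 12] * x * x * x;
    if (v < 0.f) v = 0.f;
    if (v > 255.f) v = 255.f;
    dst_argb[i] = (uint8)v;
  }
}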

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
__declspec(naked)
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]  /* poly */
    vbroadcastf128 ymm4, [ecx]       // C0
    vbroadcastf128 ymm5, [ecx + 16]  // C1
    vbroadcastf128 ymm6, [ecx + 32]  // C2
    vbroadcastf128 ymm7, [ecx + 48]  // C3
    mov        ecx, [esp + 16]  /* width */

    // 2 pixel loop.
  convertloop:
    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
    lea         eax, [eax + 8]
    vcvtdq2ps   ymm0, ymm0        // X 8 floats
    vmulps      ymm2, ymm0, ymm0  // X * X
    vmulps      ymm3, ymm0, ymm7  // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
    vcvttps2dq  ymm0, ymm0
    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
    vmovq       qword ptr [edx], xmm0
    lea         edx, [edx + 8]
    sub         ecx, 2
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
__declspec(naked)
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    movzx      edx, byte ptr [eax - 4 + 3]
    movzx      edx, byte ptr [esi + edx * 4 + 3]
    mov        byte ptr [eax - 4 + 3], dl
    dec        ecx
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBCOLORTABLEROW_X86
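
// Editorial note: scalar sketch of the in-place lookup above.  table_argb
// holds 256 interleaved 4-byte entries, so channel n of a pixel value v maps
// through table_argb[v * 4 + n].  libyuv's portable version is
// ARGBColorTableRow_C in row_common.cc; this restatement is illustrative.
static void ARGBColorTableRow_Sketch(uint8* dst_argb, const uint8* table_argb,
                                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}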

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
__declspec(naked)
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    dec        ecx
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
__declspec(naked)
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   /* src_argb */
    mov        edi, [esp + 8 + 8]   /* dst_argb */
    mov        ecx, [esp + 8 + 12]  /* width */
    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
    pshufd     xmm2, xmm2, 0
    pshufd     xmm3, xmm3, 0
    pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
    psllw      xmm4, 8
    pxor       xmm5, xmm5

    // 4 pixel loop.
  convertloop:
    movdqu     xmm0, xmmword ptr [eax]  // generate luma ptr
    pmaddubsw  xmm0, xmm3
    phaddw     xmm0, xmm0
    pand       xmm0, xmm4  // mask out low bits
    punpcklwd  xmm0, xmm5
    paddd      xmm0, xmm2  // add table base
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi], dl
    movzx      edx, byte ptr [eax + 1]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 1], dl
    movzx      edx, byte ptr [eax + 2]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 2], dl
    movzx      edx, byte ptr [eax + 3]  // copy alpha.
    mov        byte ptr [edi + 3], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 4]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 4], dl
    movzx      edx, byte ptr [eax + 5]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 5], dl
    movzx      edx, byte ptr [eax + 6]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 6], dl
    movzx      edx, byte ptr [eax + 7]  // copy alpha.
    mov        byte ptr [edi + 7], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 8]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 8], dl
    movzx      edx, byte ptr [eax + 9]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 9], dl
    movzx      edx, byte ptr [eax + 10]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 10], dl
    movzx      edx, byte ptr [eax + 11]  // copy alpha.
    mov        byte ptr [edi + 11], dl

    movd       esi, xmm0

    movzx      edx, byte ptr [eax + 12]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 12], dl
    movzx      edx, byte ptr [eax + 13]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 13], dl
    movzx      edx, byte ptr [eax + 14]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 14], dl
    movzx      edx, byte ptr [eax + 15]  // copy alpha.
    mov        byte ptr [edi + 15], dl

    lea        eax, [eax + 16]
    lea        edi, [edi + 16]
    sub        ecx, 4
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
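
// Editorial note: sketch of the addressing scheme above.  A weighted sum of
// B, G and R (byte weights packed into lumacoeff; pmaddubsw/phaddw compute
// it four pixels at a time) is masked to a multiple of 256 and used to
// select a 256-byte sub-table of luma; B, G and R are remapped through that
// sub-table and alpha is copied.  Masking with 0x7F00 here matches the asm's
// pand with 0xff00 provided the weights sum to 128 or less (the usual
// convention; an assumption, not enforced by this row function).  libyuv's
// portable version is ARGBLumaColorTableRow_C in row_common.cc.
static void ARGBLumaColorTableRow_Sketch(const uint8* src_argb,
                                         uint8* dst_argb, int width,
                                         const uint8* luma, uint32 lumacoeff) {
  uint32 bc = lumacoeff & 0xff;
  uint32 gc = (lumacoeff >> 8) & 0xff;
  uint32 rc = (lumacoeff >> 16) & 0xff;
  int x;
  for (x = 0; x < width; ++x) {
    const uint8* luma0 =
        luma + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) &
                0x7F00);
    dst_argb[0] = luma0[src_argb[0]];
    dst_argb[1] = luma0[src_argb[1]];
    dst_argb[2] = luma0[src_argb[2]];
    dst_argb[3] = src_argb[3];  // copy alpha unmodified
    src_argb += 4;
    dst_argb += 4;
  }
}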

#endif  // defined(_M_X64)
#endif  // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif