1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #if defined (_M_X64)
14 #include <emmintrin.h>
15 #include <tmmintrin.h> // For _mm_maddubs_epi16
16 #endif
17
18 #ifdef __cplusplus
19 namespace libyuv {
20 extern "C" {
21 #endif
22
23 // This module is for Visual C.
24 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
25
26 #define YG 74 /* (int8)(1.164 * 64 + 0.5) */
27
28 #define UB 127 /* min(127,(int8)(2.018 * 64)) */
29 #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
30 #define UR 0
31
32 #define VB 0
33 #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
34 #define VR 102 /* (int8)(1.596 * 64 + 0.5) */
35
36 // Bias
37 #define BB UB * 128 + VB * 128
38 #define BG UG * 128 + VG * 128
39 #define BR UR * 128 + VR * 128
40
41 static const vec8 kUVToB = {
42 UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
43 };
44
45 static const vec8 kUVToR = {
46 UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
47 };
48
49 static const vec8 kUVToG = {
50 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
51 };
52
53 static const vec8 kVUToB = {
54 VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
55 };
56
57 static const vec8 kVUToR = {
58 VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
59 };
60
61 static const vec8 kVUToG = {
62 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
63 };
64
65 static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
66 static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
67 static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
68 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
69 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
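
// A minimal scalar sketch (illustrative only; the helper names below are
// hypothetical and nothing in this file calls them) of the fixed-point math
// the constants above encode: per pixel, each channel is
// clamp(((y - 16) * YG + coefU * (u - 128) + coefV * (v - 128)) >> 6).
static __inline uint8 ScalarClamp255(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static __inline void ScalarYuvToRgbReference(uint8 y, uint8 u, uint8 v,
                                             uint8* b, uint8* g, uint8* r) {
  int y1 = ((int)y - 16) * YG;  // kYSub16 then kYToRgb
  // Subtracting coef * 128 matches the kUVBias* rows above; >> 6 is an
  // arithmetic shift, matching _mm_srai_epi16 + _mm_packus_epi16.
  *b = ScalarClamp255((y1 + UB * ((int)u - 128) + VB * ((int)v - 128)) >> 6);
  *g = ScalarClamp255((y1 + UG * ((int)u - 128) + VG * ((int)v - 128)) >> 6);
  *r = ScalarClamp255((y1 + UR * ((int)u - 128) + VR * ((int)v - 128)) >> 6);
}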
70
71 // 64 bit
72 #if defined(_M_X64)
73
74 // Aligned destination version.
75 __declspec(align(16))
76 void I422ToARGBRow_SSSE3(const uint8* y_buf,
77 const uint8* u_buf,
78 const uint8* v_buf,
79 uint8* dst_argb,
80 int width) {
81
82 __m128i xmm0, xmm1, xmm2, xmm3;
83 const __m128i xmm5 = _mm_set1_epi8(-1);
84 const __m128i xmm4 = _mm_setzero_si128();
85 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
86
87 while (width > 0) {
88 xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
89 xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
90 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
91 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
92 xmm1 = _mm_load_si128(&xmm0);
93 xmm2 = _mm_load_si128(&xmm0);
94 xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
95 xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
96 xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
97 xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
98 xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
99 xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
100 xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
101 xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
102 xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
103 xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
104 xmm0 = _mm_adds_epi16(xmm0, xmm3);
105 xmm1 = _mm_adds_epi16(xmm1, xmm3);
106 xmm2 = _mm_adds_epi16(xmm2, xmm3);
107 xmm0 = _mm_srai_epi16(xmm0, 6);
108 xmm1 = _mm_srai_epi16(xmm1, 6);
109 xmm2 = _mm_srai_epi16(xmm2, 6);
110 xmm0 = _mm_packus_epi16(xmm0, xmm0);
111 xmm1 = _mm_packus_epi16(xmm1, xmm1);
112 xmm2 = _mm_packus_epi16(xmm2, xmm2);
113 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
114 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
115 xmm1 = _mm_load_si128(&xmm0);
116 xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
117 xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
118
119 _mm_store_si128((__m128i *)dst_argb, xmm0);
120 _mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
121
122 y_buf += 8;
123 u_buf += 4;
124 dst_argb += 32;
125 width -= 8;
126 }
127 }
128
129 // Unaligned destination version.
130 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
131 const uint8* u_buf,
132 const uint8* v_buf,
133 uint8* dst_argb,
134 int width) {
135
136 __m128i xmm0, xmm1, xmm2, xmm3;
137 const __m128i xmm5 = _mm_set1_epi8(-1);
138 const __m128i xmm4 = _mm_setzero_si128();
139 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
140
141 while (width > 0) {
142 xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
143 xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
144 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
145 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
146 xmm1 = _mm_load_si128(&xmm0);
147 xmm2 = _mm_load_si128(&xmm0);
148 xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
149 xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
150 xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
151 xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
152 xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
153 xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
154 xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
155 xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
156 xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
157 xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
158 xmm0 = _mm_adds_epi16(xmm0, xmm3);
159 xmm1 = _mm_adds_epi16(xmm1, xmm3);
160 xmm2 = _mm_adds_epi16(xmm2, xmm3);
161 xmm0 = _mm_srai_epi16(xmm0, 6);
162 xmm1 = _mm_srai_epi16(xmm1, 6);
163 xmm2 = _mm_srai_epi16(xmm2, 6);
164 xmm0 = _mm_packus_epi16(xmm0, xmm0);
165 xmm1 = _mm_packus_epi16(xmm1, xmm1);
166 xmm2 = _mm_packus_epi16(xmm2, xmm2);
167 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
168 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
169 xmm1 = _mm_load_si128(&xmm0);
170 xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
171 xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
172
173 _mm_storeu_si128((__m128i *)dst_argb, xmm0);
174 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
175
176 y_buf += 8;
177 u_buf += 4;
178 dst_argb += 32;
179 width -= 8;
180 }
181 }
182 // 32 bit
183 #else // defined(_M_X64)
184
185 #ifdef HAS_ARGBTOYROW_SSSE3
186
187 // Constants for ARGB.
188 static const vec8 kARGBToY = {
189 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
190 };
191
192 // JPEG full range.
193 static const vec8 kARGBToYJ = {
194 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
195 };
196
197 static const vec8 kARGBToU = {
198 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
199 };
200
201 static const vec8 kARGBToUJ = {
202 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
203 };
204
205 static const vec8 kARGBToV = {
206 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
207 };
208
209 static const vec8 kARGBToVJ = {
210 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
211 };
212
213 // vpermd for vphaddw + vpackuswb.
214 static const lvec32 kPermdARGBToY_AVX = {
215 0, 4, 1, 5, 2, 6, 3, 7
216 };
217
218 // vpshufb for vphaddw + vpackuswb packed to shorts.
219 static const lvec8 kShufARGBToUV_AVX = {
220 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
221 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
222 };
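
// AVX2 vphaddw and vpackuswb operate within each 128-bit lane, so their
// results come out interleaved across lanes; kPermdARGBToY_AVX and
// kShufARGBToUV_AVX above restore pixel order afterwards (see the "mutates"
// comments in the AVX2 rows below).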
223
224 // Constants for BGRA.
225 static const vec8 kBGRAToY = {
226 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
227 };
228
229 static const vec8 kBGRAToU = {
230 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
231 };
232
233 static const vec8 kBGRAToV = {
234 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
235 };
236
237 // Constants for ABGR.
238 static const vec8 kABGRToY = {
239 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
240 };
241
242 static const vec8 kABGRToU = {
243 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
244 };
245
246 static const vec8 kABGRToV = {
247 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
248 };
249
250 // Constants for RGBA.
251 static const vec8 kRGBAToY = {
252 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
253 };
254
255 static const vec8 kRGBAToU = {
256 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
257 };
258
259 static const vec8 kRGBAToV = {
260 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
261 };
262
263 static const uvec8 kAddY16 = {
264 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
265 };
266
267 static const vec16 kAddYJ64 = {
268 64, 64, 64, 64, 64, 64, 64, 64
269 };
270
271 static const uvec8 kAddUV128 = {
272 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
273 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
274 };
275
276 static const uvec16 kAddUVJ128 = {
277 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
278 };
279
280 // Shuffle table for converting RGB24 to ARGB.
281 static const uvec8 kShuffleMaskRGB24ToARGB = {
282 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
283 };
284
285 // Shuffle table for converting RAW to ARGB.
286 static const uvec8 kShuffleMaskRAWToARGB = {
287 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
288 };
289
290 // Shuffle table for converting ARGB to RGB24.
291 static const uvec8 kShuffleMaskARGBToRGB24 = {
292 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
293 };
294
295 // Shuffle table for converting ARGB to RAW.
296 static const uvec8 kShuffleMaskARGBToRAW = {
297 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
298 };
299
300 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
301 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
302 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
303 };
304
305 // Shuffle table for converting ARGB to RAW. First 8 + next 4
306 static const uvec8 kShuffleMaskARGBToRAW_0 = {
307 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
308 };
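
// Illustrative scalar model (hypothetical helper, not called by the rows
// below) of how pshufb applies the shuffle tables above: destination byte i
// takes source byte shuf[i] & 15, and a shuffle byte with the high bit set
// (the 128u padding entries) produces zero.
static __inline void ScalarPshufbReference(const uint8* src, const uint8* shuf,
                                           uint8* dst) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (uint8)((shuf[i] & 0x80) ? 0 : src[shuf[i] & 15]);
  }
}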
309
310 // Duplicates gray value 3 times and fills in alpha opaque.
311 __declspec(naked) __declspec(align(16))
312 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
313 __asm {
314 mov eax, [esp + 4] // src_y
315 mov edx, [esp + 8] // dst_argb
316 mov ecx, [esp + 12] // pix
317 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
318 pslld xmm5, 24
319
320 align 4
321 convertloop:
322 movq xmm0, qword ptr [eax]
323 lea eax, [eax + 8]
324 punpcklbw xmm0, xmm0
325 movdqa xmm1, xmm0
326 punpcklwd xmm0, xmm0
327 punpckhwd xmm1, xmm1
328 por xmm0, xmm5
329 por xmm1, xmm5
330 movdqa [edx], xmm0
331 movdqa [edx + 16], xmm1
332 lea edx, [edx + 32]
333 sub ecx, 8
334 jg convertloop
335 ret
336 }
337 }
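
// Scalar sketch (illustrative only; hypothetical helper) of the expansion the
// row above performs: the gray value fills B, G and R, and alpha is opaque.
static __inline void ScalarI400ToARGBReference(uint8 y, uint8* dst_argb) {
  dst_argb[0] = y;     // B
  dst_argb[1] = y;     // G
  dst_argb[2] = y;     // R
  dst_argb[3] = 255u;  // A
}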
338
339 __declspec(naked) __declspec(align(16))
340 void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
341 int pix) {
342 __asm {
343 mov eax, [esp + 4] // src_y
344 mov edx, [esp + 8] // dst_argb
345 mov ecx, [esp + 12] // pix
346 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
347 pslld xmm5, 24
348
349 align 4
350 convertloop:
351 movq xmm0, qword ptr [eax]
352 lea eax, [eax + 8]
353 punpcklbw xmm0, xmm0
354 movdqa xmm1, xmm0
355 punpcklwd xmm0, xmm0
356 punpckhwd xmm1, xmm1
357 por xmm0, xmm5
358 por xmm1, xmm5
359 movdqu [edx], xmm0
360 movdqu [edx + 16], xmm1
361 lea edx, [edx + 32]
362 sub ecx, 8
363 jg convertloop
364 ret
365 }
366 }
367
368 __declspec(naked) __declspec(align(16))
369 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
370 __asm {
371 mov eax, [esp + 4] // src_rgb24
372 mov edx, [esp + 8] // dst_argb
373 mov ecx, [esp + 12] // pix
374 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
375 pslld xmm5, 24
376 movdqa xmm4, kShuffleMaskRGB24ToARGB
377
378 align 4
379 convertloop:
380 movdqu xmm0, [eax]
381 movdqu xmm1, [eax + 16]
382 movdqu xmm3, [eax + 32]
383 lea eax, [eax + 48]
384 movdqa xmm2, xmm3
385 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
386 pshufb xmm2, xmm4
387 por xmm2, xmm5
388 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
389 pshufb xmm0, xmm4
390 movdqa [edx + 32], xmm2
391 por xmm0, xmm5
392 pshufb xmm1, xmm4
393 movdqa [edx], xmm0
394 por xmm1, xmm5
395 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
396 pshufb xmm3, xmm4
397 movdqa [edx + 16], xmm1
398 por xmm3, xmm5
399 sub ecx, 16
400 movdqa [edx + 48], xmm3
401 lea edx, [edx + 64]
402 jg convertloop
403 ret
404 }
405 }
406
407 __declspec(naked) __declspec(align(16))
408 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
409 int pix) {
410 __asm {
411 mov eax, [esp + 4] // src_raw
412 mov edx, [esp + 8] // dst_argb
413 mov ecx, [esp + 12] // pix
414 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
415 pslld xmm5, 24
416 movdqa xmm4, kShuffleMaskRAWToARGB
417
418 align 4
419 convertloop:
420 movdqu xmm0, [eax]
421 movdqu xmm1, [eax + 16]
422 movdqu xmm3, [eax + 32]
423 lea eax, [eax + 48]
424 movdqa xmm2, xmm3
425 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
426 pshufb xmm2, xmm4
427 por xmm2, xmm5
428 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
429 pshufb xmm0, xmm4
430 movdqa [edx + 32], xmm2
431 por xmm0, xmm5
432 pshufb xmm1, xmm4
433 movdqa [edx], xmm0
434 por xmm1, xmm5
435 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
436 pshufb xmm3, xmm4
437 movdqa [edx + 16], xmm1
438 por xmm3, xmm5
439 sub ecx, 16
440 movdqa [edx + 48], xmm3
441 lea edx, [edx + 64]
442 jg convertloop
443 ret
444 }
445 }
446
447 // pmul method to replicate bits.
448 // Math to replicate bits:
449 // (v << 8) | (v << 3)
450 // v * 256 + v * 8
451 // v * (256 + 8)
452 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
453 // 20 instructions.
454 __declspec(naked) __declspec(align(16))
455 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
456 int pix) {
457 __asm {
458 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
459 movd xmm5, eax
460 pshufd xmm5, xmm5, 0
461 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
462 movd xmm6, eax
463 pshufd xmm6, xmm6, 0
464 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
465 psllw xmm3, 11
466 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
467 psllw xmm4, 10
468 psrlw xmm4, 5
469 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
470 psllw xmm7, 8
471
472 mov eax, [esp + 4] // src_rgb565
473 mov edx, [esp + 8] // dst_argb
474 mov ecx, [esp + 12] // pix
475 sub edx, eax
476 sub edx, eax
477
478 align 4
479 convertloop:
480 movdqu xmm0, [eax] // fetch 8 pixels of bgr565
481 movdqa xmm1, xmm0
482 movdqa xmm2, xmm0
483 pand xmm1, xmm3 // R in upper 5 bits
484 psllw xmm2, 11 // B in upper 5 bits
485 pmulhuw xmm1, xmm5 // * (256 + 8)
486 pmulhuw xmm2, xmm5 // * (256 + 8)
487 psllw xmm1, 8
488 por xmm1, xmm2 // RB
489 pand xmm0, xmm4 // G in middle 6 bits
490 pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
491 por xmm0, xmm7 // AG
492 movdqa xmm2, xmm1
493 punpcklbw xmm1, xmm0
494 punpckhbw xmm2, xmm0
495 movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
496 movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
497 lea eax, [eax + 16]
498 sub ecx, 8
499 jg convertloop
500 ret
501 }
502 }
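
// A plain-C sketch (illustrative only; hypothetical helper) of the expansion
// the row above does with pmulhuw: a 5-bit field widens to 8 bits as
// (v << 3) | (v >> 2) and the 6-bit green field as (v << 2) | (v >> 4).
static __inline void ScalarRGB565ToARGBReference(uint16 rgb565,
                                                 uint8* dst_argb) {
  int b = rgb565 & 0x1f;
  int g = (rgb565 >> 5) & 0x3f;
  int r = (rgb565 >> 11) & 0x1f;
  dst_argb[0] = (uint8)((b << 3) | (b >> 2));  // B
  dst_argb[1] = (uint8)((g << 2) | (g >> 4));  // G
  dst_argb[2] = (uint8)((r << 3) | (r >> 2));  // R
  dst_argb[3] = 255u;                          // A
}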
503
504 // 24 instructions
505 __declspec(naked) __declspec(align(16))
506 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
507 int pix) {
508 __asm {
509 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
510 movd xmm5, eax
511 pshufd xmm5, xmm5, 0
512 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
513 movd xmm6, eax
514 pshufd xmm6, xmm6, 0
515 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
516 psllw xmm3, 11
517 movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
518 psrlw xmm4, 6
519 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
520 psllw xmm7, 8
521
522 mov eax, [esp + 4] // src_argb1555
523 mov edx, [esp + 8] // dst_argb
524 mov ecx, [esp + 12] // pix
525 sub edx, eax
526 sub edx, eax
527
528 align 4
529 convertloop:
530 movdqu xmm0, [eax] // fetch 8 pixels of 1555
531 movdqa xmm1, xmm0
532 movdqa xmm2, xmm0
533 psllw xmm1, 1 // R in upper 5 bits
534 psllw xmm2, 11 // B in upper 5 bits
535 pand xmm1, xmm3
536 pmulhuw xmm2, xmm5 // * (256 + 8)
537 pmulhuw xmm1, xmm5 // * (256 + 8)
538 psllw xmm1, 8
539 por xmm1, xmm2 // RB
540 movdqa xmm2, xmm0
541 pand xmm0, xmm4 // G in middle 5 bits
542 psraw xmm2, 8 // A
543 pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
544 pand xmm2, xmm7
545 por xmm0, xmm2 // AG
546 movdqa xmm2, xmm1
547 punpcklbw xmm1, xmm0
548 punpckhbw xmm2, xmm0
549 movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
550 movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
551 lea eax, [eax + 16]
552 sub ecx, 8
553 jg convertloop
554 ret
555 }
556 }
557
558 // 18 instructions.
559 __declspec(naked) __declspec(align(16))
560 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
561 int pix) {
562 __asm {
563 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
564 movd xmm4, eax
565 pshufd xmm4, xmm4, 0
566 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
567 pslld xmm5, 4
568 mov eax, [esp + 4] // src_argb4444
569 mov edx, [esp + 8] // dst_argb
570 mov ecx, [esp + 12] // pix
571 sub edx, eax
572 sub edx, eax
573
574 align 4
575 convertloop:
576 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
577 movdqa xmm2, xmm0
578 pand xmm0, xmm4 // mask low nibbles
579 pand xmm2, xmm5 // mask high nibbles
580 movdqa xmm1, xmm0
581 movdqa xmm3, xmm2
582 psllw xmm1, 4
583 psrlw xmm3, 4
584 por xmm0, xmm1
585 por xmm2, xmm3
586 movdqa xmm1, xmm0
587 punpcklbw xmm0, xmm2
588 punpckhbw xmm1, xmm2
589 movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
590 movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
591 lea eax, [eax + 16]
592 sub ecx, 8
593 jg convertloop
594 ret
595 }
596 }
597
598 __declspec(naked) __declspec(align(16))
599 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
600 __asm {
601 mov eax, [esp + 4] // src_argb
602 mov edx, [esp + 8] // dst_rgb
603 mov ecx, [esp + 12] // pix
604 movdqa xmm6, kShuffleMaskARGBToRGB24
605
606 align 4
607 convertloop:
608 movdqu xmm0, [eax] // fetch 16 pixels of argb
609 movdqu xmm1, [eax + 16]
610 movdqu xmm2, [eax + 32]
611 movdqu xmm3, [eax + 48]
612 lea eax, [eax + 64]
613 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
614 pshufb xmm1, xmm6
615 pshufb xmm2, xmm6
616 pshufb xmm3, xmm6
617 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
618 psrldq xmm1, 4 // 8 bytes from 1
619 pslldq xmm4, 12 // 4 bytes from 1 for 0
620 movdqa xmm5, xmm2 // 8 bytes from 2 for 1
621 por xmm0, xmm4 // 4 bytes from 1 for 0
622 pslldq xmm5, 8 // 8 bytes from 2 for 1
623 movdqu [edx], xmm0 // store 0
624 por xmm1, xmm5 // 8 bytes from 2 for 1
625 psrldq xmm2, 8 // 4 bytes from 2
626 pslldq xmm3, 4 // 12 bytes from 3 for 2
627 por xmm2, xmm3 // 12 bytes from 3 for 2
628 movdqu [edx + 16], xmm1 // store 1
629 movdqu [edx + 32], xmm2 // store 2
630 lea edx, [edx + 48]
631 sub ecx, 16
632 jg convertloop
633 ret
634 }
635 }
636
637 __declspec(naked) __declspec(align(16))
638 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
639 __asm {
640 mov eax, [esp + 4] // src_argb
641 mov edx, [esp + 8] // dst_rgb
642 mov ecx, [esp + 12] // pix
643 movdqa xmm6, kShuffleMaskARGBToRAW
644
645 align 4
646 convertloop:
647 movdqu xmm0, [eax] // fetch 16 pixels of argb
648 movdqu xmm1, [eax + 16]
649 movdqu xmm2, [eax + 32]
650 movdqu xmm3, [eax + 48]
651 lea eax, [eax + 64]
652 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
653 pshufb xmm1, xmm6
654 pshufb xmm2, xmm6
655 pshufb xmm3, xmm6
656 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
657 psrldq xmm1, 4 // 8 bytes from 1
658 pslldq xmm4, 12 // 4 bytes from 1 for 0
659 movdqa xmm5, xmm2 // 8 bytes from 2 for 1
660 por xmm0, xmm4 // 4 bytes from 1 for 0
661 pslldq xmm5, 8 // 8 bytes from 2 for 1
662 movdqu [edx], xmm0 // store 0
663 por xmm1, xmm5 // 8 bytes from 2 for 1
664 psrldq xmm2, 8 // 4 bytes from 2
665 pslldq xmm3, 4 // 12 bytes from 3 for 2
666 por xmm2, xmm3 // 12 bytes from 3 for 2
667 movdqu [edx + 16], xmm1 // store 1
668 movdqu [edx + 32], xmm2 // store 2
669 lea edx, [edx + 48]
670 sub ecx, 16
671 jg convertloop
672 ret
673 }
674 }
675
676 __declspec(naked) __declspec(align(16))
677 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
678 __asm {
679 mov eax, [esp + 4] // src_argb
680 mov edx, [esp + 8] // dst_rgb
681 mov ecx, [esp + 12] // pix
682 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
683 psrld xmm3, 27
684 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
685 psrld xmm4, 26
686 pslld xmm4, 5
687 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
688 pslld xmm5, 11
689
690 align 4
691 convertloop:
692 movdqa xmm0, [eax] // fetch 4 pixels of argb
693 movdqa xmm1, xmm0 // B
694 movdqa xmm2, xmm0 // G
695 pslld xmm0, 8 // R
696 psrld xmm1, 3 // B
697 psrld xmm2, 5 // G
698 psrad xmm0, 16 // R
699 pand xmm1, xmm3 // B
700 pand xmm2, xmm4 // G
701 pand xmm0, xmm5 // R
702 por xmm1, xmm2 // BG
703 por xmm0, xmm1 // BGR
704 packssdw xmm0, xmm0
705 lea eax, [eax + 16]
706 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
707 lea edx, [edx + 8]
708 sub ecx, 4
709 jg convertloop
710 ret
711 }
712 }
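
// Scalar sketch (illustrative only; hypothetical helper) of the packing the
// row above performs for each ARGB pixel (bytes are B, G, R, A in memory):
static __inline uint16 ScalarARGBToRGB565Reference(const uint8* src_argb) {
  return (uint16)((src_argb[0] >> 3) |           // B -> bits 0..4
                  ((src_argb[1] >> 2) << 5) |    // G -> bits 5..10
                  ((src_argb[2] >> 3) << 11));   // R -> bits 11..15
}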
713
714 // TODO(fbarchard): Improve sign extension/packing.
715 __declspec(naked) __declspec(align(16))
716 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
717 __asm {
718 mov eax, [esp + 4] // src_argb
719 mov edx, [esp + 8] // dst_rgb
720 mov ecx, [esp + 12] // pix
721 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
722 psrld xmm4, 27
723 movdqa xmm5, xmm4 // generate mask 0x000003e0
724 pslld xmm5, 5
725 movdqa xmm6, xmm4 // generate mask 0x00007c00
726 pslld xmm6, 10
727 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
728 pslld xmm7, 15
729
730 align 4
731 convertloop:
732 movdqa xmm0, [eax] // fetch 4 pixels of argb
733 movdqa xmm1, xmm0 // B
734 movdqa xmm2, xmm0 // G
735 movdqa xmm3, xmm0 // R
736 psrad xmm0, 16 // A
737 psrld xmm1, 3 // B
738 psrld xmm2, 6 // G
739 psrld xmm3, 9 // R
740 pand xmm0, xmm7 // A
741 pand xmm1, xmm4 // B
742 pand xmm2, xmm5 // G
743 pand xmm3, xmm6 // R
744 por xmm0, xmm1 // BA
745 por xmm2, xmm3 // GR
746 por xmm0, xmm2 // BGRA
747 packssdw xmm0, xmm0
748 lea eax, [eax + 16]
749 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
750 lea edx, [edx + 8]
751 sub ecx, 4
752 jg convertloop
753 ret
754 }
755 }
756
757 __declspec(naked) __declspec(align(16))
758 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
759 __asm {
760 mov eax, [esp + 4] // src_argb
761 mov edx, [esp + 8] // dst_rgb
762 mov ecx, [esp + 12] // pix
763 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
764 psllw xmm4, 12
765 movdqa xmm3, xmm4 // generate mask 0x00f000f0
766 psrlw xmm3, 8
767
768 align 4
769 convertloop:
770 movdqa xmm0, [eax] // fetch 4 pixels of argb
771 movdqa xmm1, xmm0
772 pand xmm0, xmm3 // low nibble
773 pand xmm1, xmm4 // high nibble
774 psrl xmm0, 4
775 psrl xmm1, 8
776 por xmm0, xmm1
777 packuswb xmm0, xmm0
778 lea eax, [eax + 16]
779 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
780 lea edx, [edx + 8]
781 sub ecx, 4
782 jg convertloop
783 ret
784 }
785 }
786
787 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
788 __declspec(naked) __declspec(align(16))
789 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
790 __asm {
791 mov eax, [esp + 4] /* src_argb */
792 mov edx, [esp + 8] /* dst_y */
793 mov ecx, [esp + 12] /* pix */
794 movdqa xmm5, kAddY16
795 movdqa xmm4, kARGBToY
796
797 align 4
798 convertloop:
799 movdqa xmm0, [eax]
800 movdqa xmm1, [eax + 16]
801 movdqa xmm2, [eax + 32]
802 movdqa xmm3, [eax + 48]
803 pmaddubsw xmm0, xmm4
804 pmaddubsw xmm1, xmm4
805 pmaddubsw xmm2, xmm4
806 pmaddubsw xmm3, xmm4
807 lea eax, [eax + 64]
808 phaddw xmm0, xmm1
809 phaddw xmm2, xmm3
810 psrlw xmm0, 7
811 psrlw xmm2, 7
812 packuswb xmm0, xmm2
813 paddb xmm0, xmm5
814 sub ecx, 16
815 movdqa [edx], xmm0
816 lea edx, [edx + 16]
817 jg convertloop
818 ret
819 }
820 }
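
// Scalar sketch (illustrative only; hypothetical helper) of the per-pixel Y
// computed by the row above, with ARGB stored as B, G, R, A bytes:
static __inline uint8 ScalarARGBToYReference(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}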
821
822 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
823 __declspec(naked) __declspec(align(16))
824 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
825 __asm {
826 mov eax, [esp + 4] /* src_argb */
827 mov edx, [esp + 8] /* dst_y */
828 mov ecx, [esp + 12] /* pix */
829 movdqa xmm4, kARGBToYJ
830 movdqa xmm5, kAddYJ64
831
832 align 4
833 convertloop:
834 movdqa xmm0, [eax]
835 movdqa xmm1, [eax + 16]
836 movdqa xmm2, [eax + 32]
837 movdqa xmm3, [eax + 48]
838 pmaddubsw xmm0, xmm4
839 pmaddubsw xmm1, xmm4
840 pmaddubsw xmm2, xmm4
841 pmaddubsw xmm3, xmm4
842 lea eax, [eax + 64]
843 phaddw xmm0, xmm1
844 phaddw xmm2, xmm3
845 paddw xmm0, xmm5 // Add .5 for rounding.
846 paddw xmm2, xmm5
847 psrlw xmm0, 7
848 psrlw xmm2, 7
849 packuswb xmm0, xmm2
850 sub ecx, 16
851 movdqa [edx], xmm0
852 lea edx, [edx + 16]
853 jg convertloop
854 ret
855 }
856 }
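
// Scalar sketch (illustrative only; hypothetical helper) of the JPEG-range
// row above: full-range coefficients, round-to-nearest, and no +16 offset.
static __inline uint8 ScalarARGBToYJReference(uint8 b, uint8 g, uint8 r) {
  return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);
}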
857
858 #ifdef HAS_ARGBTOYROW_AVX2
859 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
860 __declspec(naked) __declspec(align(32))
861 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
862 __asm {
863 mov eax, [esp + 4] /* src_argb */
864 mov edx, [esp + 8] /* dst_y */
865 mov ecx, [esp + 12] /* pix */
866 vbroadcastf128 ymm4, kARGBToY
867 vbroadcastf128 ymm5, kAddY16
868 vmovdqa ymm6, kPermdARGBToY_AVX
869
870 align 4
871 convertloop:
872 vmovdqu ymm0, [eax]
873 vmovdqu ymm1, [eax + 32]
874 vmovdqu ymm2, [eax + 64]
875 vmovdqu ymm3, [eax + 96]
876 vpmaddubsw ymm0, ymm0, ymm4
877 vpmaddubsw ymm1, ymm1, ymm4
878 vpmaddubsw ymm2, ymm2, ymm4
879 vpmaddubsw ymm3, ymm3, ymm4
880 lea eax, [eax + 128]
881 vphaddw ymm0, ymm0, ymm1 // mutates.
882 vphaddw ymm2, ymm2, ymm3
883 vpsrlw ymm0, ymm0, 7
884 vpsrlw ymm2, ymm2, 7
885 vpackuswb ymm0, ymm0, ymm2 // mutates.
886 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
887 vpaddb ymm0, ymm0, ymm5
888 sub ecx, 32
889 vmovdqu [edx], ymm0
890 lea edx, [edx + 32]
891 jg convertloop
892 vzeroupper
893 ret
894 }
895 }
896 #endif // HAS_ARGBTOYROW_AVX2
897
898 #ifdef HAS_ARGBTOYROW_AVX2
899 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
900 __declspec(naked) __declspec(align(32))
901 void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
902 __asm {
903 mov eax, [esp + 4] /* src_argb */
904 mov edx, [esp + 8] /* dst_y */
905 mov ecx, [esp + 12] /* pix */
906 vbroadcastf128 ymm4, kARGBToYJ
907 vbroadcastf128 ymm5, kAddYJ64
908 vmovdqa ymm6, kPermdARGBToY_AVX
909
910 align 4
911 convertloop:
912 vmovdqu ymm0, [eax]
913 vmovdqu ymm1, [eax + 32]
914 vmovdqu ymm2, [eax + 64]
915 vmovdqu ymm3, [eax + 96]
916 vpmaddubsw ymm0, ymm0, ymm4
917 vpmaddubsw ymm1, ymm1, ymm4
918 vpmaddubsw ymm2, ymm2, ymm4
919 vpmaddubsw ymm3, ymm3, ymm4
920 lea eax, [eax + 128]
921 vphaddw ymm0, ymm0, ymm1 // mutates.
922 vphaddw ymm2, ymm2, ymm3
923 vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
924 vpaddw ymm2, ymm2, ymm5
925 vpsrlw ymm0, ymm0, 7
926 vpsrlw ymm2, ymm2, 7
927 vpackuswb ymm0, ymm0, ymm2 // mutates.
928 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
929 sub ecx, 32
930 vmovdqu [edx], ymm0
931 lea edx, [edx + 32]
932 jg convertloop
933
934 vzeroupper
935 ret
936 }
937 }
938 #endif // HAS_ARGBTOYROW_AVX2
939
940 __declspec(naked) __declspec(align(16))
941 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
942 __asm {
943 mov eax, [esp + 4] /* src_argb */
944 mov edx, [esp + 8] /* dst_y */
945 mov ecx, [esp + 12] /* pix */
946 movdqa xmm5, kAddY16
947 movdqa xmm4, kARGBToY
948
949 align 4
950 convertloop:
951 movdqu xmm0, [eax]
952 movdqu xmm1, [eax + 16]
953 movdqu xmm2, [eax + 32]
954 movdqu xmm3, [eax + 48]
955 pmaddubsw xmm0, xmm4
956 pmaddubsw xmm1, xmm4
957 pmaddubsw xmm2, xmm4
958 pmaddubsw xmm3, xmm4
959 lea eax, [eax + 64]
960 phaddw xmm0, xmm1
961 phaddw xmm2, xmm3
962 psrlw xmm0, 7
963 psrlw xmm2, 7
964 packuswb xmm0, xmm2
965 paddb xmm0, xmm5
966 sub ecx, 16
967 movdqu [edx], xmm0
968 lea edx, [edx + 16]
969 jg convertloop
970 ret
971 }
972 }
973
974 __declspec(naked) __declspec(align(16))
975 void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
976 __asm {
977 mov eax, [esp + 4] /* src_argb */
978 mov edx, [esp + 8] /* dst_y */
979 mov ecx, [esp + 12] /* pix */
980 movdqa xmm4, kARGBToYJ
981 movdqa xmm5, kAddYJ64
982
983 align 4
984 convertloop:
985 movdqu xmm0, [eax]
986 movdqu xmm1, [eax + 16]
987 movdqu xmm2, [eax + 32]
988 movdqu xmm3, [eax + 48]
989 pmaddubsw xmm0, xmm4
990 pmaddubsw xmm1, xmm4
991 pmaddubsw xmm2, xmm4
992 pmaddubsw xmm3, xmm4
993 lea eax, [eax + 64]
994 phaddw xmm0, xmm1
995 phaddw xmm2, xmm3
996 paddw xmm0, xmm5
997 paddw xmm2, xmm5
998 psrlw xmm0, 7
999 psrlw xmm2, 7
1000 packuswb xmm0, xmm2
1001 sub ecx, 16
1002 movdqu [edx], xmm0
1003 lea edx, [edx + 16]
1004 jg convertloop
1005 ret
1006 }
1007 }
1008
1009 __declspec(naked) __declspec(align(16))
1010 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1011 __asm {
1012 mov eax, [esp + 4] /* src_argb */
1013 mov edx, [esp + 8] /* dst_y */
1014 mov ecx, [esp + 12] /* pix */
1015 movdqa xmm5, kAddY16
1016 movdqa xmm4, kBGRAToY
1017
1018 align 4
1019 convertloop:
1020 movdqa xmm0, [eax]
1021 movdqa xmm1, [eax + 16]
1022 movdqa xmm2, [eax + 32]
1023 movdqa xmm3, [eax + 48]
1024 pmaddubsw xmm0, xmm4
1025 pmaddubsw xmm1, xmm4
1026 pmaddubsw xmm2, xmm4
1027 pmaddubsw xmm3, xmm4
1028 lea eax, [eax + 64]
1029 phaddw xmm0, xmm1
1030 phaddw xmm2, xmm3
1031 psrlw xmm0, 7
1032 psrlw xmm2, 7
1033 packuswb xmm0, xmm2
1034 paddb xmm0, xmm5
1035 sub ecx, 16
1036 movdqa [edx], xmm0
1037 lea edx, [edx + 16]
1038 jg convertloop
1039 ret
1040 }
1041 }
1042
1043 __declspec(naked) __declspec(align(16))
1044 void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1045 __asm {
1046 mov eax, [esp + 4] /* src_argb */
1047 mov edx, [esp + 8] /* dst_y */
1048 mov ecx, [esp + 12] /* pix */
1049 movdqa xmm5, kAddY16
1050 movdqa xmm4, kBGRAToY
1051
1052 align 4
1053 convertloop:
1054 movdqu xmm0, [eax]
1055 movdqu xmm1, [eax + 16]
1056 movdqu xmm2, [eax + 32]
1057 movdqu xmm3, [eax + 48]
1058 pmaddubsw xmm0, xmm4
1059 pmaddubsw xmm1, xmm4
1060 pmaddubsw xmm2, xmm4
1061 pmaddubsw xmm3, xmm4
1062 lea eax, [eax + 64]
1063 phaddw xmm0, xmm1
1064 phaddw xmm2, xmm3
1065 psrlw xmm0, 7
1066 psrlw xmm2, 7
1067 packuswb xmm0, xmm2
1068 paddb xmm0, xmm5
1069 sub ecx, 16
1070 movdqu [edx], xmm0
1071 lea edx, [edx + 16]
1072 jg convertloop
1073 ret
1074 }
1075 }
1076
1077 __declspec(naked) __declspec(align(16))
1078 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1079 __asm {
1080 mov eax, [esp + 4] /* src_argb */
1081 mov edx, [esp + 8] /* dst_y */
1082 mov ecx, [esp + 12] /* pix */
1083 movdqa xmm5, kAddY16
1084 movdqa xmm4, kABGRToY
1085
1086 align 4
1087 convertloop:
1088 movdqa xmm0, [eax]
1089 movdqa xmm1, [eax + 16]
1090 movdqa xmm2, [eax + 32]
1091 movdqa xmm3, [eax + 48]
1092 pmaddubsw xmm0, xmm4
1093 pmaddubsw xmm1, xmm4
1094 pmaddubsw xmm2, xmm4
1095 pmaddubsw xmm3, xmm4
1096 lea eax, [eax + 64]
1097 phaddw xmm0, xmm1
1098 phaddw xmm2, xmm3
1099 psrlw xmm0, 7
1100 psrlw xmm2, 7
1101 packuswb xmm0, xmm2
1102 paddb xmm0, xmm5
1103 sub ecx, 16
1104 movdqa [edx], xmm0
1105 lea edx, [edx + 16]
1106 jg convertloop
1107 ret
1108 }
1109 }
1110
1111 __declspec(naked) __declspec(align(16))
1112 void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1113 __asm {
1114 mov eax, [esp + 4] /* src_argb */
1115 mov edx, [esp + 8] /* dst_y */
1116 mov ecx, [esp + 12] /* pix */
1117 movdqa xmm5, kAddY16
1118 movdqa xmm4, kABGRToY
1119
1120 align 4
1121 convertloop:
1122 movdqu xmm0, [eax]
1123 movdqu xmm1, [eax + 16]
1124 movdqu xmm2, [eax + 32]
1125 movdqu xmm3, [eax + 48]
1126 pmaddubsw xmm0, xmm4
1127 pmaddubsw xmm1, xmm4
1128 pmaddubsw xmm2, xmm4
1129 pmaddubsw xmm3, xmm4
1130 lea eax, [eax + 64]
1131 phaddw xmm0, xmm1
1132 phaddw xmm2, xmm3
1133 psrlw xmm0, 7
1134 psrlw xmm2, 7
1135 packuswb xmm0, xmm2
1136 paddb xmm0, xmm5
1137 sub ecx, 16
1138 movdqu [edx], xmm0
1139 lea edx, [edx + 16]
1140 jg convertloop
1141 ret
1142 }
1143 }
1144
1145 __declspec(naked) __declspec(align(16))
1146 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1147 __asm {
1148 mov eax, [esp + 4] /* src_argb */
1149 mov edx, [esp + 8] /* dst_y */
1150 mov ecx, [esp + 12] /* pix */
1151 movdqa xmm5, kAddY16
1152 movdqa xmm4, kRGBAToY
1153
1154 align 4
1155 convertloop:
1156 movdqa xmm0, [eax]
1157 movdqa xmm1, [eax + 16]
1158 movdqa xmm2, [eax + 32]
1159 movdqa xmm3, [eax + 48]
1160 pmaddubsw xmm0, xmm4
1161 pmaddubsw xmm1, xmm4
1162 pmaddubsw xmm2, xmm4
1163 pmaddubsw xmm3, xmm4
1164 lea eax, [eax + 64]
1165 phaddw xmm0, xmm1
1166 phaddw xmm2, xmm3
1167 psrlw xmm0, 7
1168 psrlw xmm2, 7
1169 packuswb xmm0, xmm2
1170 paddb xmm0, xmm5
1171 sub ecx, 16
1172 movdqa [edx], xmm0
1173 lea edx, [edx + 16]
1174 jg convertloop
1175 ret
1176 }
1177 }
1178
1179 __declspec(naked) __declspec(align(16))
1180 void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1181 __asm {
1182 mov eax, [esp + 4] /* src_argb */
1183 mov edx, [esp + 8] /* dst_y */
1184 mov ecx, [esp + 12] /* pix */
1185 movdqa xmm5, kAddY16
1186 movdqa xmm4, kRGBAToY
1187
1188 align 4
1189 convertloop:
1190 movdqu xmm0, [eax]
1191 movdqu xmm1, [eax + 16]
1192 movdqu xmm2, [eax + 32]
1193 movdqu xmm3, [eax + 48]
1194 pmaddubsw xmm0, xmm4
1195 pmaddubsw xmm1, xmm4
1196 pmaddubsw xmm2, xmm4
1197 pmaddubsw xmm3, xmm4
1198 lea eax, [eax + 64]
1199 phaddw xmm0, xmm1
1200 phaddw xmm2, xmm3
1201 psrlw xmm0, 7
1202 psrlw xmm2, 7
1203 packuswb xmm0, xmm2
1204 paddb xmm0, xmm5
1205 sub ecx, 16
1206 movdqu [edx], xmm0
1207 lea edx, [edx + 16]
1208 jg convertloop
1209 ret
1210 }
1211 }
1212
1213 __declspec(naked) __declspec(align(16))
1214 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1215 uint8* dst_u, uint8* dst_v, int width) {
1216 __asm {
1217 push esi
1218 push edi
1219 mov eax, [esp + 8 + 4] // src_argb
1220 mov esi, [esp + 8 + 8] // src_stride_argb
1221 mov edx, [esp + 8 + 12] // dst_u
1222 mov edi, [esp + 8 + 16] // dst_v
1223 mov ecx, [esp + 8 + 20] // pix
1224 movdqa xmm7, kARGBToU
1225 movdqa xmm6, kARGBToV
1226 movdqa xmm5, kAddUV128
1227 sub edi, edx // stride from u to v
1228
1229 align 4
1230 convertloop:
1231 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1232 movdqa xmm0, [eax]
1233 movdqa xmm1, [eax + 16]
1234 movdqa xmm2, [eax + 32]
1235 movdqa xmm3, [eax + 48]
1236 pavgb xmm0, [eax + esi]
1237 pavgb xmm1, [eax + esi + 16]
1238 pavgb xmm2, [eax + esi + 32]
1239 pavgb xmm3, [eax + esi + 48]
1240 lea eax, [eax + 64]
1241 movdqa xmm4, xmm0
1242 shufps xmm0, xmm1, 0x88
1243 shufps xmm4, xmm1, 0xdd
1244 pavgb xmm0, xmm4
1245 movdqa xmm4, xmm2
1246 shufps xmm2, xmm3, 0x88
1247 shufps xmm4, xmm3, 0xdd
1248 pavgb xmm2, xmm4
1249
1250 // step 2 - convert to U and V
1251 // from here down is very similar to Y code except
1252     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1253 movdqa xmm1, xmm0
1254 movdqa xmm3, xmm2
1255 pmaddubsw xmm0, xmm7 // U
1256 pmaddubsw xmm2, xmm7
1257 pmaddubsw xmm1, xmm6 // V
1258 pmaddubsw xmm3, xmm6
1259 phaddw xmm0, xmm2
1260 phaddw xmm1, xmm3
1261 psraw xmm0, 8
1262 psraw xmm1, 8
1263 packsswb xmm0, xmm1
1264 paddb xmm0, xmm5 // -> unsigned
1265
1266 // step 3 - store 8 U and 8 V values
1267 sub ecx, 16
1268 movlps qword ptr [edx], xmm0 // U
1269 movhps qword ptr [edx + edi], xmm0 // V
1270 lea edx, [edx + 8]
1271 jg convertloop
1272
1273 pop edi
1274 pop esi
1275 ret
1276 }
1277 }
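
// Scalar sketch (illustrative only; hypothetical helper) of one U/V pair as
// computed by the row above, given the 2x2 box-averaged B, G, R of a pixel
// pair; >> 8 is an arithmetic shift, matching psraw in the SIMD code.
static __inline void ScalarARGBToUVReference(uint8 b, uint8 g, uint8 r,
                                             uint8* u, uint8* v) {
  *u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}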
1278
1279 __declspec(naked) __declspec(align(16))
1280 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1281 uint8* dst_u, uint8* dst_v, int width) {
1282 __asm {
1283 push esi
1284 push edi
1285 mov eax, [esp + 8 + 4] // src_argb
1286 mov esi, [esp + 8 + 8] // src_stride_argb
1287 mov edx, [esp + 8 + 12] // dst_u
1288 mov edi, [esp + 8 + 16] // dst_v
1289 mov ecx, [esp + 8 + 20] // pix
1290 movdqa xmm7, kARGBToUJ
1291 movdqa xmm6, kARGBToVJ
1292 movdqa xmm5, kAddUVJ128
1293 sub edi, edx // stride from u to v
1294
1295 align 4
1296 convertloop:
1297 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1298 movdqa xmm0, [eax]
1299 movdqa xmm1, [eax + 16]
1300 movdqa xmm2, [eax + 32]
1301 movdqa xmm3, [eax + 48]
1302 pavgb xmm0, [eax + esi]
1303 pavgb xmm1, [eax + esi + 16]
1304 pavgb xmm2, [eax + esi + 32]
1305 pavgb xmm3, [eax + esi + 48]
1306 lea eax, [eax + 64]
1307 movdqa xmm4, xmm0
1308 shufps xmm0, xmm1, 0x88
1309 shufps xmm4, xmm1, 0xdd
1310 pavgb xmm0, xmm4
1311 movdqa xmm4, xmm2
1312 shufps xmm2, xmm3, 0x88
1313 shufps xmm4, xmm3, 0xdd
1314 pavgb xmm2, xmm4
1315
1316 // step 2 - convert to U and V
1317 // from here down is very similar to Y code except
1318     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1319 movdqa xmm1, xmm0
1320 movdqa xmm3, xmm2
1321 pmaddubsw xmm0, xmm7 // U
1322 pmaddubsw xmm2, xmm7
1323 pmaddubsw xmm1, xmm6 // V
1324 pmaddubsw xmm3, xmm6
1325 phaddw xmm0, xmm2
1326 phaddw xmm1, xmm3
1327 paddw xmm0, xmm5 // +.5 rounding -> unsigned
1328 paddw xmm1, xmm5
1329 psraw xmm0, 8
1330 psraw xmm1, 8
1331 packsswb xmm0, xmm1
1332
1333 // step 3 - store 8 U and 8 V values
1334 sub ecx, 16
1335 movlps qword ptr [edx], xmm0 // U
1336 movhps qword ptr [edx + edi], xmm0 // V
1337 lea edx, [edx + 8]
1338 jg convertloop
1339
1340 pop edi
1341 pop esi
1342 ret
1343 }
1344 }
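
// Scalar sketch (illustrative only; hypothetical helper) of the JPEG-range
// variant above: kAddUVJ128 folds the +128 offset and the +0.5 rounding into
// a single 0x8080 addend applied before the shift.
static __inline void ScalarARGBToUVJReference(uint8 b, uint8 g, uint8 r,
                                              uint8* u, uint8* v) {
  *u = (uint8)((127 * b - 84 * g - 43 * r + 0x8080) >> 8);
  *v = (uint8)((-20 * b - 107 * g + 127 * r + 0x8080) >> 8);
}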
1345
1346 #ifdef HAS_ARGBTOUVROW_AVX2
1347 __declspec(naked) __declspec(align(32))
1348 void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
1349 uint8* dst_u, uint8* dst_v, int width) {
1350 __asm {
1351 push esi
1352 push edi
1353 mov eax, [esp + 8 + 4] // src_argb
1354 mov esi, [esp + 8 + 8] // src_stride_argb
1355 mov edx, [esp + 8 + 12] // dst_u
1356 mov edi, [esp + 8 + 16] // dst_v
1357 mov ecx, [esp + 8 + 20] // pix
1358 vbroadcastf128 ymm5, kAddUV128
1359 vbroadcastf128 ymm6, kARGBToV
1360 vbroadcastf128 ymm7, kARGBToU
1361 sub edi, edx // stride from u to v
1362
1363 align 4
1364 convertloop:
1365 /* step 1 - subsample 32x2 argb pixels to 16x1 */
1366 vmovdqu ymm0, [eax]
1367 vmovdqu ymm1, [eax + 32]
1368 vmovdqu ymm2, [eax + 64]
1369 vmovdqu ymm3, [eax + 96]
1370 vpavgb ymm0, ymm0, [eax + esi]
1371 vpavgb ymm1, ymm1, [eax + esi + 32]
1372 vpavgb ymm2, ymm2, [eax + esi + 64]
1373 vpavgb ymm3, ymm3, [eax + esi + 96]
1374 lea eax, [eax + 128]
1375 vshufps ymm4, ymm0, ymm1, 0x88
1376 vshufps ymm0, ymm0, ymm1, 0xdd
1377 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
1378 vshufps ymm4, ymm2, ymm3, 0x88
1379 vshufps ymm2, ymm2, ymm3, 0xdd
1380 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
1381
1382 // step 2 - convert to U and V
1383 // from here down is very similar to Y code except
1384     // instead of 32 different pixels, it's 16 pixels of U and 16 of V
1385 vpmaddubsw ymm1, ymm0, ymm7 // U
1386 vpmaddubsw ymm3, ymm2, ymm7
1387 vpmaddubsw ymm0, ymm0, ymm6 // V
1388 vpmaddubsw ymm2, ymm2, ymm6
1389 vphaddw ymm1, ymm1, ymm3 // mutates
1390 vphaddw ymm0, ymm0, ymm2
1391 vpsraw ymm1, ymm1, 8
1392 vpsraw ymm0, ymm0, 8
1393 vpacksswb ymm0, ymm1, ymm0 // mutates
1394 vpermq ymm0, ymm0, 0xd8 // For vpacksswb
1395 vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw
1396 vpaddb ymm0, ymm0, ymm5 // -> unsigned
1397
1398 // step 3 - store 16 U and 16 V values
1399 sub ecx, 32
1400 vextractf128 [edx], ymm0, 0 // U
1401 vextractf128 [edx + edi], ymm0, 1 // V
1402 lea edx, [edx + 16]
1403 jg convertloop
1404
1405 pop edi
1406 pop esi
1407 vzeroupper
1408 ret
1409 }
1410 }
1411 #endif // HAS_ARGBTOUVROW_AVX2
1412
1413 __declspec(naked) __declspec(align(16))
1414 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
1415 uint8* dst_u, uint8* dst_v, int width) {
1416 __asm {
1417 push esi
1418 push edi
1419 mov eax, [esp + 8 + 4] // src_argb
1420 mov esi, [esp + 8 + 8] // src_stride_argb
1421 mov edx, [esp + 8 + 12] // dst_u
1422 mov edi, [esp + 8 + 16] // dst_v
1423 mov ecx, [esp + 8 + 20] // pix
1424 movdqa xmm7, kARGBToU
1425 movdqa xmm6, kARGBToV
1426 movdqa xmm5, kAddUV128
1427 sub edi, edx // stride from u to v
1428
1429 align 4
1430 convertloop:
1431 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1432 movdqu xmm0, [eax]
1433 movdqu xmm1, [eax + 16]
1434 movdqu xmm2, [eax + 32]
1435 movdqu xmm3, [eax + 48]
1436 movdqu xmm4, [eax + esi]
1437 pavgb xmm0, xmm4
1438 movdqu xmm4, [eax + esi + 16]
1439 pavgb xmm1, xmm4
1440 movdqu xmm4, [eax + esi + 32]
1441 pavgb xmm2, xmm4
1442 movdqu xmm4, [eax + esi + 48]
1443 pavgb xmm3, xmm4
1444 lea eax, [eax + 64]
1445 movdqa xmm4, xmm0
1446 shufps xmm0, xmm1, 0x88
1447 shufps xmm4, xmm1, 0xdd
1448 pavgb xmm0, xmm4
1449 movdqa xmm4, xmm2
1450 shufps xmm2, xmm3, 0x88
1451 shufps xmm4, xmm3, 0xdd
1452 pavgb xmm2, xmm4
1453
1454 // step 2 - convert to U and V
1455 // from here down is very similar to Y code except
1456     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1457 movdqa xmm1, xmm0
1458 movdqa xmm3, xmm2
1459 pmaddubsw xmm0, xmm7 // U
1460 pmaddubsw xmm2, xmm7
1461 pmaddubsw xmm1, xmm6 // V
1462 pmaddubsw xmm3, xmm6
1463 phaddw xmm0, xmm2
1464 phaddw xmm1, xmm3
1465 psraw xmm0, 8
1466 psraw xmm1, 8
1467 packsswb xmm0, xmm1
1468 paddb xmm0, xmm5 // -> unsigned
1469
1470 // step 3 - store 8 U and 8 V values
1471 sub ecx, 16
1472 movlps qword ptr [edx], xmm0 // U
1473 movhps qword ptr [edx + edi], xmm0 // V
1474 lea edx, [edx + 8]
1475 jg convertloop
1476
1477 pop edi
1478 pop esi
1479 ret
1480 }
1481 }
1482
1483 __declspec(naked) __declspec(align(16))
1484 void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
1485 uint8* dst_u, uint8* dst_v, int width) {
1486 __asm {
1487 push esi
1488 push edi
1489 mov eax, [esp + 8 + 4] // src_argb
1490 mov esi, [esp + 8 + 8] // src_stride_argb
1491 mov edx, [esp + 8 + 12] // dst_u
1492 mov edi, [esp + 8 + 16] // dst_v
1493 mov ecx, [esp + 8 + 20] // pix
1494 movdqa xmm7, kARGBToUJ
1495 movdqa xmm6, kARGBToVJ
1496 movdqa xmm5, kAddUVJ128
1497 sub edi, edx // stride from u to v
1498
1499 align 4
1500 convertloop:
1501 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1502 movdqu xmm0, [eax]
1503 movdqu xmm1, [eax + 16]
1504 movdqu xmm2, [eax + 32]
1505 movdqu xmm3, [eax + 48]
1506 movdqu xmm4, [eax + esi]
1507 pavgb xmm0, xmm4
1508 movdqu xmm4, [eax + esi + 16]
1509 pavgb xmm1, xmm4
1510 movdqu xmm4, [eax + esi + 32]
1511 pavgb xmm2, xmm4
1512 movdqu xmm4, [eax + esi + 48]
1513 pavgb xmm3, xmm4
1514 lea eax, [eax + 64]
1515 movdqa xmm4, xmm0
1516 shufps xmm0, xmm1, 0x88
1517 shufps xmm4, xmm1, 0xdd
1518 pavgb xmm0, xmm4
1519 movdqa xmm4, xmm2
1520 shufps xmm2, xmm3, 0x88
1521 shufps xmm4, xmm3, 0xdd
1522 pavgb xmm2, xmm4
1523
1524 // step 2 - convert to U and V
1525 // from here down is very similar to Y code except
1526     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1527 movdqa xmm1, xmm0
1528 movdqa xmm3, xmm2
1529 pmaddubsw xmm0, xmm7 // U
1530 pmaddubsw xmm2, xmm7
1531 pmaddubsw xmm1, xmm6 // V
1532 pmaddubsw xmm3, xmm6
1533 phaddw xmm0, xmm2
1534 phaddw xmm1, xmm3
1535 paddw xmm0, xmm5 // +.5 rounding -> unsigned
1536 paddw xmm1, xmm5
1537 psraw xmm0, 8
1538 psraw xmm1, 8
1539 packsswb xmm0, xmm1
1540
1541 // step 3 - store 8 U and 8 V values
1542 sub ecx, 16
1543 movlps qword ptr [edx], xmm0 // U
1544 movhps qword ptr [edx + edi], xmm0 // V
1545 lea edx, [edx + 8]
1546 jg convertloop
1547
1548 pop edi
1549 pop esi
1550 ret
1551 }
1552 }
1553
1554 __declspec(naked) __declspec(align(16))
1555 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
1556 uint8* dst_u, uint8* dst_v, int width) {
1557 __asm {
1558 push edi
1559 mov eax, [esp + 4 + 4] // src_argb
1560 mov edx, [esp + 4 + 8] // dst_u
1561 mov edi, [esp + 4 + 12] // dst_v
1562 mov ecx, [esp + 4 + 16] // pix
1563 movdqa xmm7, kARGBToU
1564 movdqa xmm6, kARGBToV
1565 movdqa xmm5, kAddUV128
1566 sub edi, edx // stride from u to v
1567
1568 align 4
1569 convertloop:
1570 /* convert to U and V */
1571 movdqa xmm0, [eax] // U
1572 movdqa xmm1, [eax + 16]
1573 movdqa xmm2, [eax + 32]
1574 movdqa xmm3, [eax + 48]
1575 pmaddubsw xmm0, xmm7
1576 pmaddubsw xmm1, xmm7
1577 pmaddubsw xmm2, xmm7
1578 pmaddubsw xmm3, xmm7
1579 phaddw xmm0, xmm1
1580 phaddw xmm2, xmm3
1581 psraw xmm0, 8
1582 psraw xmm2, 8
1583 packsswb xmm0, xmm2
1584 paddb xmm0, xmm5
1585 sub ecx, 16
1586 movdqa [edx], xmm0
1587
1588 movdqa xmm0, [eax] // V
1589 movdqa xmm1, [eax + 16]
1590 movdqa xmm2, [eax + 32]
1591 movdqa xmm3, [eax + 48]
1592 pmaddubsw xmm0, xmm6
1593 pmaddubsw xmm1, xmm6
1594 pmaddubsw xmm2, xmm6
1595 pmaddubsw xmm3, xmm6
1596 phaddw xmm0, xmm1
1597 phaddw xmm2, xmm3
1598 psraw xmm0, 8
1599 psraw xmm2, 8
1600 packsswb xmm0, xmm2
1601 paddb xmm0, xmm5
1602 lea eax, [eax + 64]
1603 movdqa [edx + edi], xmm0
1604 lea edx, [edx + 16]
1605 jg convertloop
1606
1607 pop edi
1608 ret
1609 }
1610 }
1611
1612 __declspec(naked) __declspec(align(16))
1613 void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
1614 uint8* dst_u, uint8* dst_v, int width) {
1615 __asm {
1616 push edi
1617 mov eax, [esp + 4 + 4] // src_argb
1618 mov edx, [esp + 4 + 8] // dst_u
1619 mov edi, [esp + 4 + 12] // dst_v
1620 mov ecx, [esp + 4 + 16] // pix
1621 movdqa xmm7, kARGBToU
1622 movdqa xmm6, kARGBToV
1623 movdqa xmm5, kAddUV128
1624 sub edi, edx // stride from u to v
1625
1626 align 4
1627 convertloop:
1628 /* convert to U and V */
1629 movdqu xmm0, [eax] // U
1630 movdqu xmm1, [eax + 16]
1631 movdqu xmm2, [eax + 32]
1632 movdqu xmm3, [eax + 48]
1633 pmaddubsw xmm0, xmm7
1634 pmaddubsw xmm1, xmm7
1635 pmaddubsw xmm2, xmm7
1636 pmaddubsw xmm3, xmm7
1637 phaddw xmm0, xmm1
1638 phaddw xmm2, xmm3
1639 psraw xmm0, 8
1640 psraw xmm2, 8
1641 packsswb xmm0, xmm2
1642 paddb xmm0, xmm5
1643 sub ecx, 16
1644 movdqu [edx], xmm0
1645
1646 movdqu xmm0, [eax] // V
1647 movdqu xmm1, [eax + 16]
1648 movdqu xmm2, [eax + 32]
1649 movdqu xmm3, [eax + 48]
1650 pmaddubsw xmm0, xmm6
1651 pmaddubsw xmm1, xmm6
1652 pmaddubsw xmm2, xmm6
1653 pmaddubsw xmm3, xmm6
1654 phaddw xmm0, xmm1
1655 phaddw xmm2, xmm3
1656 psraw xmm0, 8
1657 psraw xmm2, 8
1658 packsswb xmm0, xmm2
1659 paddb xmm0, xmm5
1660 lea eax, [eax + 64]
1661 movdqu [edx + edi], xmm0
1662 lea edx, [edx + 16]
1663 jg convertloop
1664
1665 pop edi
1666 ret
1667 }
1668 }
1669
1670 __declspec(naked) __declspec(align(16))
1671 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
1672 uint8* dst_u, uint8* dst_v, int width) {
1673 __asm {
1674 push edi
1675 mov eax, [esp + 4 + 4] // src_argb
1676 mov edx, [esp + 4 + 8] // dst_u
1677 mov edi, [esp + 4 + 12] // dst_v
1678 mov ecx, [esp + 4 + 16] // pix
1679 movdqa xmm7, kARGBToU
1680 movdqa xmm6, kARGBToV
1681 movdqa xmm5, kAddUV128
1682 sub edi, edx // stride from u to v
1683
1684 align 4
1685 convertloop:
1686 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1687 movdqa xmm0, [eax]
1688 movdqa xmm1, [eax + 16]
1689 movdqa xmm2, [eax + 32]
1690 movdqa xmm3, [eax + 48]
1691 lea eax, [eax + 64]
1692 movdqa xmm4, xmm0
1693 shufps xmm0, xmm1, 0x88
1694 shufps xmm4, xmm1, 0xdd
1695 pavgb xmm0, xmm4
1696 movdqa xmm4, xmm2
1697 shufps xmm2, xmm3, 0x88
1698 shufps xmm4, xmm3, 0xdd
1699 pavgb xmm2, xmm4
1700
1701 // step 2 - convert to U and V
1702 // from here down is very similar to Y code except
1703     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1704 movdqa xmm1, xmm0
1705 movdqa xmm3, xmm2
1706 pmaddubsw xmm0, xmm7 // U
1707 pmaddubsw xmm2, xmm7
1708 pmaddubsw xmm1, xmm6 // V
1709 pmaddubsw xmm3, xmm6
1710 phaddw xmm0, xmm2
1711 phaddw xmm1, xmm3
1712 psraw xmm0, 8
1713 psraw xmm1, 8
1714 packsswb xmm0, xmm1
1715 paddb xmm0, xmm5 // -> unsigned
1716
1717 // step 3 - store 8 U and 8 V values
1718 sub ecx, 16
1719 movlps qword ptr [edx], xmm0 // U
1720 movhps qword ptr [edx + edi], xmm0 // V
1721 lea edx, [edx + 8]
1722 jg convertloop
1723
1724 pop edi
1725 ret
1726 }
1727 }
1728
1729 __declspec(naked) __declspec(align(16))
1730 void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
1731 uint8* dst_u, uint8* dst_v, int width) {
1732 __asm {
1733 push edi
1734 mov eax, [esp + 4 + 4] // src_argb
1735 mov edx, [esp + 4 + 8] // dst_u
1736 mov edi, [esp + 4 + 12] // dst_v
1737 mov ecx, [esp + 4 + 16] // pix
1738 movdqa xmm7, kARGBToU
1739 movdqa xmm6, kARGBToV
1740 movdqa xmm5, kAddUV128
1741 sub edi, edx // stride from u to v
1742
1743 align 4
1744 convertloop:
1745 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1746 movdqu xmm0, [eax]
1747 movdqu xmm1, [eax + 16]
1748 movdqu xmm2, [eax + 32]
1749 movdqu xmm3, [eax + 48]
1750 lea eax, [eax + 64]
1751 movdqa xmm4, xmm0
1752 shufps xmm0, xmm1, 0x88
1753 shufps xmm4, xmm1, 0xdd
1754 pavgb xmm0, xmm4
1755 movdqa xmm4, xmm2
1756 shufps xmm2, xmm3, 0x88
1757 shufps xmm4, xmm3, 0xdd
1758 pavgb xmm2, xmm4
1759
1760 // step 2 - convert to U and V
1761 // from here down is very similar to Y code except
1762     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1763 movdqa xmm1, xmm0
1764 movdqa xmm3, xmm2
1765 pmaddubsw xmm0, xmm7 // U
1766 pmaddubsw xmm2, xmm7
1767 pmaddubsw xmm1, xmm6 // V
1768 pmaddubsw xmm3, xmm6
1769 phaddw xmm0, xmm2
1770 phaddw xmm1, xmm3
1771 psraw xmm0, 8
1772 psraw xmm1, 8
1773 packsswb xmm0, xmm1
1774 paddb xmm0, xmm5 // -> unsigned
1775
1776 // step 3 - store 8 U and 8 V values
1777 sub ecx, 16
1778 movlps qword ptr [edx], xmm0 // U
1779 movhps qword ptr [edx + edi], xmm0 // V
1780 lea edx, [edx + 8]
1781 jg convertloop
1782
1783 pop edi
1784 ret
1785 }
1786 }
1787
1788 __declspec(naked) __declspec(align(16))
1789 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1790 uint8* dst_u, uint8* dst_v, int width) {
1791 __asm {
1792 push esi
1793 push edi
1794 mov eax, [esp + 8 + 4] // src_argb
1795 mov esi, [esp + 8 + 8] // src_stride_argb
1796 mov edx, [esp + 8 + 12] // dst_u
1797 mov edi, [esp + 8 + 16] // dst_v
1798 mov ecx, [esp + 8 + 20] // pix
1799 movdqa xmm7, kBGRAToU
1800 movdqa xmm6, kBGRAToV
1801 movdqa xmm5, kAddUV128
1802 sub edi, edx // stride from u to v
1803
1804 align 4
1805 convertloop:
1806 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1807 movdqa xmm0, [eax]
1808 movdqa xmm1, [eax + 16]
1809 movdqa xmm2, [eax + 32]
1810 movdqa xmm3, [eax + 48]
1811 pavgb xmm0, [eax + esi]
1812 pavgb xmm1, [eax + esi + 16]
1813 pavgb xmm2, [eax + esi + 32]
1814 pavgb xmm3, [eax + esi + 48]
1815 lea eax, [eax + 64]
1816 movdqa xmm4, xmm0
1817 shufps xmm0, xmm1, 0x88
1818 shufps xmm4, xmm1, 0xdd
1819 pavgb xmm0, xmm4
1820 movdqa xmm4, xmm2
1821 shufps xmm2, xmm3, 0x88
1822 shufps xmm4, xmm3, 0xdd
1823 pavgb xmm2, xmm4
1824
1825 // step 2 - convert to U and V
1826 // from here down is very similar to Y code except
1827     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1828 movdqa xmm1, xmm0
1829 movdqa xmm3, xmm2
1830 pmaddubsw xmm0, xmm7 // U
1831 pmaddubsw xmm2, xmm7
1832 pmaddubsw xmm1, xmm6 // V
1833 pmaddubsw xmm3, xmm6
1834 phaddw xmm0, xmm2
1835 phaddw xmm1, xmm3
1836 psraw xmm0, 8
1837 psraw xmm1, 8
1838 packsswb xmm0, xmm1
1839 paddb xmm0, xmm5 // -> unsigned
1840
1841 // step 3 - store 8 U and 8 V values
1842 sub ecx, 16
1843 movlps qword ptr [edx], xmm0 // U
1844 movhps qword ptr [edx + edi], xmm0 // V
1845 lea edx, [edx + 8]
1846 jg convertloop
1847
1848 pop edi
1849 pop esi
1850 ret
1851 }
1852 }
1853
1854 __declspec(naked) __declspec(align(16))
1855 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
1856 uint8* dst_u, uint8* dst_v, int width) {
1857 __asm {
1858 push esi
1859 push edi
1860 mov eax, [esp + 8 + 4] // src_argb
1861 mov esi, [esp + 8 + 8] // src_stride_argb
1862 mov edx, [esp + 8 + 12] // dst_u
1863 mov edi, [esp + 8 + 16] // dst_v
1864 mov ecx, [esp + 8 + 20] // pix
1865 movdqa xmm7, kBGRAToU
1866 movdqa xmm6, kBGRAToV
1867 movdqa xmm5, kAddUV128
1868 sub edi, edx // stride from u to v
1869
1870 align 4
1871 convertloop:
1872 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1873 movdqu xmm0, [eax]
1874 movdqu xmm1, [eax + 16]
1875 movdqu xmm2, [eax + 32]
1876 movdqu xmm3, [eax + 48]
1877 movdqu xmm4, [eax + esi]
1878 pavgb xmm0, xmm4
1879 movdqu xmm4, [eax + esi + 16]
1880 pavgb xmm1, xmm4
1881 movdqu xmm4, [eax + esi + 32]
1882 pavgb xmm2, xmm4
1883 movdqu xmm4, [eax + esi + 48]
1884 pavgb xmm3, xmm4
1885 lea eax, [eax + 64]
1886 movdqa xmm4, xmm0
1887 shufps xmm0, xmm1, 0x88
1888 shufps xmm4, xmm1, 0xdd
1889 pavgb xmm0, xmm4
1890 movdqa xmm4, xmm2
1891 shufps xmm2, xmm3, 0x88
1892 shufps xmm4, xmm3, 0xdd
1893 pavgb xmm2, xmm4
1894
1895 // step 2 - convert to U and V
1896 // from here down is very similar to Y code except
1897     // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1898 movdqa xmm1, xmm0
1899 movdqa xmm3, xmm2
1900 pmaddubsw xmm0, xmm7 // U
1901 pmaddubsw xmm2, xmm7
1902 pmaddubsw xmm1, xmm6 // V
1903 pmaddubsw xmm3, xmm6
1904 phaddw xmm0, xmm2
1905 phaddw xmm1, xmm3
1906 psraw xmm0, 8
1907 psraw xmm1, 8
1908 packsswb xmm0, xmm1
1909 paddb xmm0, xmm5 // -> unsigned
1910
1911 // step 3 - store 8 U and 8 V values
1912 sub ecx, 16
1913 movlps qword ptr [edx], xmm0 // U
1914 movhps qword ptr [edx + edi], xmm0 // V
1915 lea edx, [edx + 8]
1916 jg convertloop
1917
1918 pop edi
1919 pop esi
1920 ret
1921 }
1922 }
1923
1924 __declspec(naked) __declspec(align(16))
1925 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1926 uint8* dst_u, uint8* dst_v, int width) {
1927 __asm {
1928 push esi
1929 push edi
1930 mov eax, [esp + 8 + 4] // src_argb
1931 mov esi, [esp + 8 + 8] // src_stride_argb
1932 mov edx, [esp + 8 + 12] // dst_u
1933 mov edi, [esp + 8 + 16] // dst_v
1934 mov ecx, [esp + 8 + 20] // pix
1935 movdqa xmm7, kABGRToU
1936 movdqa xmm6, kABGRToV
1937 movdqa xmm5, kAddUV128
1938 sub edi, edx // stride from u to v
1939
1940 align 4
1941 convertloop:
1942 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1943 movdqa xmm0, [eax]
1944 movdqa xmm1, [eax + 16]
1945 movdqa xmm2, [eax + 32]
1946 movdqa xmm3, [eax + 48]
1947 pavgb xmm0, [eax + esi]
1948 pavgb xmm1, [eax + esi + 16]
1949 pavgb xmm2, [eax + esi + 32]
1950 pavgb xmm3, [eax + esi + 48]
1951 lea eax, [eax + 64]
1952 movdqa xmm4, xmm0
1953 shufps xmm0, xmm1, 0x88
1954 shufps xmm4, xmm1, 0xdd
1955 pavgb xmm0, xmm4
1956 movdqa xmm4, xmm2
1957 shufps xmm2, xmm3, 0x88
1958 shufps xmm4, xmm3, 0xdd
1959 pavgb xmm2, xmm4
1960
1961 // step 2 - convert to U and V
1962 // from here down is very similar to Y code except
1963 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1964 movdqa xmm1, xmm0
1965 movdqa xmm3, xmm2
1966 pmaddubsw xmm0, xmm7 // U
1967 pmaddubsw xmm2, xmm7
1968 pmaddubsw xmm1, xmm6 // V
1969 pmaddubsw xmm3, xmm6
1970 phaddw xmm0, xmm2
1971 phaddw xmm1, xmm3
1972 psraw xmm0, 8
1973 psraw xmm1, 8
1974 packsswb xmm0, xmm1
1975 paddb xmm0, xmm5 // -> unsigned
1976
1977 // step 3 - store 8 U and 8 V values
1978 sub ecx, 16
1979 movlps qword ptr [edx], xmm0 // U
1980 movhps qword ptr [edx + edi], xmm0 // V
1981 lea edx, [edx + 8]
1982 jg convertloop
1983
1984 pop edi
1985 pop esi
1986 ret
1987 }
1988 }
1989
1990 __declspec(naked) __declspec(align(16))
1991 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
1992 uint8* dst_u, uint8* dst_v, int width) {
1993 __asm {
1994 push esi
1995 push edi
1996 mov eax, [esp + 8 + 4] // src_argb
1997 mov esi, [esp + 8 + 8] // src_stride_argb
1998 mov edx, [esp + 8 + 12] // dst_u
1999 mov edi, [esp + 8 + 16] // dst_v
2000 mov ecx, [esp + 8 + 20] // pix
2001 movdqa xmm7, kABGRToU
2002 movdqa xmm6, kABGRToV
2003 movdqa xmm5, kAddUV128
2004 sub edi, edx // stride from u to v
2005
2006 align 4
2007 convertloop:
2008 /* step 1 - subsample 16x2 argb pixels to 8x1 */
2009 movdqu xmm0, [eax]
2010 movdqu xmm1, [eax + 16]
2011 movdqu xmm2, [eax + 32]
2012 movdqu xmm3, [eax + 48]
2013 movdqu xmm4, [eax + esi]
2014 pavgb xmm0, xmm4
2015 movdqu xmm4, [eax + esi + 16]
2016 pavgb xmm1, xmm4
2017 movdqu xmm4, [eax + esi + 32]
2018 pavgb xmm2, xmm4
2019 movdqu xmm4, [eax + esi + 48]
2020 pavgb xmm3, xmm4
2021 lea eax, [eax + 64]
2022 movdqa xmm4, xmm0
2023 shufps xmm0, xmm1, 0x88
2024 shufps xmm4, xmm1, 0xdd
2025 pavgb xmm0, xmm4
2026 movdqa xmm4, xmm2
2027 shufps xmm2, xmm3, 0x88
2028 shufps xmm4, xmm3, 0xdd
2029 pavgb xmm2, xmm4
2030
2031 // step 2 - convert to U and V
2032 // from here down is very similar to Y code except
2033 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
2034 movdqa xmm1, xmm0
2035 movdqa xmm3, xmm2
2036 pmaddubsw xmm0, xmm7 // U
2037 pmaddubsw xmm2, xmm7
2038 pmaddubsw xmm1, xmm6 // V
2039 pmaddubsw xmm3, xmm6
2040 phaddw xmm0, xmm2
2041 phaddw xmm1, xmm3
2042 psraw xmm0, 8
2043 psraw xmm1, 8
2044 packsswb xmm0, xmm1
2045 paddb xmm0, xmm5 // -> unsigned
2046
2047 // step 3 - store 8 U and 8 V values
2048 sub ecx, 16
2049 movlps qword ptr [edx], xmm0 // U
2050 movhps qword ptr [edx + edi], xmm0 // V
2051 lea edx, [edx + 8]
2052 jg convertloop
2053
2054 pop edi
2055 pop esi
2056 ret
2057 }
2058 }
2059
2060 __declspec(naked) __declspec(align(16))
2061 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
2062 uint8* dst_u, uint8* dst_v, int width) {
2063 __asm {
2064 push esi
2065 push edi
2066 mov eax, [esp + 8 + 4] // src_argb
2067 mov esi, [esp + 8 + 8] // src_stride_argb
2068 mov edx, [esp + 8 + 12] // dst_u
2069 mov edi, [esp + 8 + 16] // dst_v
2070 mov ecx, [esp + 8 + 20] // pix
2071 movdqa xmm7, kRGBAToU
2072 movdqa xmm6, kRGBAToV
2073 movdqa xmm5, kAddUV128
2074 sub edi, edx // stride from u to v
2075
2076 align 4
2077 convertloop:
2078 /* step 1 - subsample 16x2 argb pixels to 8x1 */
2079 movdqa xmm0, [eax]
2080 movdqa xmm1, [eax + 16]
2081 movdqa xmm2, [eax + 32]
2082 movdqa xmm3, [eax + 48]
2083 pavgb xmm0, [eax + esi]
2084 pavgb xmm1, [eax + esi + 16]
2085 pavgb xmm2, [eax + esi + 32]
2086 pavgb xmm3, [eax + esi + 48]
2087 lea eax, [eax + 64]
2088 movdqa xmm4, xmm0
2089 shufps xmm0, xmm1, 0x88
2090 shufps xmm4, xmm1, 0xdd
2091 pavgb xmm0, xmm4
2092 movdqa xmm4, xmm2
2093 shufps xmm2, xmm3, 0x88
2094 shufps xmm4, xmm3, 0xdd
2095 pavgb xmm2, xmm4
2096
2097 // step 2 - convert to U and V
2098 // from here down is very similar to Y code except
2099 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
2100 movdqa xmm1, xmm0
2101 movdqa xmm3, xmm2
2102 pmaddubsw xmm0, xmm7 // U
2103 pmaddubsw xmm2, xmm7
2104 pmaddubsw xmm1, xmm6 // V
2105 pmaddubsw xmm3, xmm6
2106 phaddw xmm0, xmm2
2107 phaddw xmm1, xmm3
2108 psraw xmm0, 8
2109 psraw xmm1, 8
2110 packsswb xmm0, xmm1
2111 paddb xmm0, xmm5 // -> unsigned
2112
2113 // step 3 - store 8 U and 8 V values
2114 sub ecx, 16
2115 movlps qword ptr [edx], xmm0 // U
2116 movhps qword ptr [edx + edi], xmm0 // V
2117 lea edx, [edx + 8]
2118 jg convertloop
2119
2120 pop edi
2121 pop esi
2122 ret
2123 }
2124 }
2125
2126 __declspec(naked) __declspec(align(16))
2127 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
2128 uint8* dst_u, uint8* dst_v, int width) {
2129 __asm {
2130 push esi
2131 push edi
2132 mov eax, [esp + 8 + 4] // src_argb
2133 mov esi, [esp + 8 + 8] // src_stride_argb
2134 mov edx, [esp + 8 + 12] // dst_u
2135 mov edi, [esp + 8 + 16] // dst_v
2136 mov ecx, [esp + 8 + 20] // pix
2137 movdqa xmm7, kRGBAToU
2138 movdqa xmm6, kRGBAToV
2139 movdqa xmm5, kAddUV128
2140 sub edi, edx // stride from u to v
2141
2142 align 4
2143 convertloop:
2144 /* step 1 - subsample 16x2 argb pixels to 8x1 */
2145 movdqu xmm0, [eax]
2146 movdqu xmm1, [eax + 16]
2147 movdqu xmm2, [eax + 32]
2148 movdqu xmm3, [eax + 48]
2149 movdqu xmm4, [eax + esi]
2150 pavgb xmm0, xmm4
2151 movdqu xmm4, [eax + esi + 16]
2152 pavgb xmm1, xmm4
2153 movdqu xmm4, [eax + esi + 32]
2154 pavgb xmm2, xmm4
2155 movdqu xmm4, [eax + esi + 48]
2156 pavgb xmm3, xmm4
2157 lea eax, [eax + 64]
2158 movdqa xmm4, xmm0
2159 shufps xmm0, xmm1, 0x88
2160 shufps xmm4, xmm1, 0xdd
2161 pavgb xmm0, xmm4
2162 movdqa xmm4, xmm2
2163 shufps xmm2, xmm3, 0x88
2164 shufps xmm4, xmm3, 0xdd
2165 pavgb xmm2, xmm4
2166
2167 // step 2 - convert to U and V
2168 // from here down is very similar to Y code except
2169 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
2170 movdqa xmm1, xmm0
2171 movdqa xmm3, xmm2
2172 pmaddubsw xmm0, xmm7 // U
2173 pmaddubsw xmm2, xmm7
2174 pmaddubsw xmm1, xmm6 // V
2175 pmaddubsw xmm3, xmm6
2176 phaddw xmm0, xmm2
2177 phaddw xmm1, xmm3
2178 psraw xmm0, 8
2179 psraw xmm1, 8
2180 packsswb xmm0, xmm1
2181 paddb xmm0, xmm5 // -> unsigned
2182
2183 // step 3 - store 8 U and 8 V values
2184 sub ecx, 16
2185 movlps qword ptr [edx], xmm0 // U
2186 movhps qword ptr [edx + edi], xmm0 // V
2187 lea edx, [edx + 8]
2188 jg convertloop
2189
2190 pop edi
2191 pop esi
2192 ret
2193 }
2194 }
2195 #endif // HAS_ARGBTOYROW_SSSE3
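
// Illustrative scalar sketch of the UV row functions above (guarded out of
// the build, not libyuv's own C path). The BGRA/ABGR/RGBA variants differ
// only in which coefficient table (kBGRAToU, kABGRToU, kRGBAToU, ...) maps
// the byte positions, so this sketch is written for the ARGB byte order
// (B, G, R, A in memory). The 112/74/38 and 112/94/18 coefficients are
// nominal BT.601-style values assumed here for illustration; the SIMD loops
// take their exact coefficients from the tables named above, and they round
// the 2x2 subsample slightly differently (two rounds of pavgb).
#if 0
static void ARGBToUVRow_ScalarSketch(const uint8* src_argb, int src_stride_argb,
                                     uint8* dst_u, uint8* dst_v, int width) {
  const uint8* next_row = src_argb + src_stride_argb;
  for (int x = 0; x < width; x += 2) {
    // Step 1: average a 2x2 block of pixels down to one B, G, R sample.
    int b = (src_argb[0] + src_argb[4] + next_row[0] + next_row[4] + 2) >> 2;
    int g = (src_argb[1] + src_argb[5] + next_row[1] + next_row[5] + 2) >> 2;
    int r = (src_argb[2] + src_argb[6] + next_row[2] + next_row[6] + 2) >> 2;
    // Step 2: fixed point dot products; 0x8080 biases the result into 0..255.
    *dst_u++ = (uint8)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    *dst_v++ = (uint8)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    src_argb += 8;
    next_row += 8;
  }
}
#endif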
2196
2197 #ifdef HAS_I422TOARGBROW_AVX2
2198
2199 static const lvec8 kUVToB_AVX = {
2200 UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
2201 UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
2202 };
2203 static const lvec8 kUVToR_AVX = {
2204 UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
2205 UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
2206 };
2207 static const lvec8 kUVToG_AVX = {
2208 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
2209 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
2210 };
2211 static const lvec16 kYToRgb_AVX = {
2212 YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
2213 };
2214 static const lvec16 kYSub16_AVX = {
2215 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
2216 };
2217 static const lvec16 kUVBiasB_AVX = {
2218 BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
2219 };
2220 static const lvec16 kUVBiasG_AVX = {
2221 BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
2222 };
2223 static const lvec16 kUVBiasR_AVX = {
2224 BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
2225 };
2226
2227 // 16 pixels
2228 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2229 __declspec(naked) __declspec(align(16))
2230 void I422ToARGBRow_AVX2(const uint8* y_buf,
2231 const uint8* u_buf,
2232 const uint8* v_buf,
2233 uint8* dst_argb,
2234 int width) {
2235 __asm {
2236 push esi
2237 push edi
2238 mov eax, [esp + 8 + 4] // Y
2239 mov esi, [esp + 8 + 8] // U
2240 mov edi, [esp + 8 + 12] // V
2241 mov edx, [esp + 8 + 16] // argb
2242 mov ecx, [esp + 8 + 20] // width
2243 sub edi, esi
2244 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2245 vpxor ymm4, ymm4, ymm4
2246
2247 align 4
2248 convertloop:
2249 vmovq xmm0, qword ptr [esi] // U
2250 vmovq xmm1, qword ptr [esi + edi] // V
2251 lea esi, [esi + 8]
2252 vpunpcklbw ymm0, ymm0, ymm1 // UV
2253 vpermq ymm0, ymm0, 0xd8
2254 vpunpcklwd ymm0, ymm0, ymm0 // UVUV
2255 vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV
2256 vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV
2257 vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV
2258 vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed
2259 vpsubw ymm1, ymm1, kUVBiasG_AVX
2260 vpsubw ymm0, ymm0, kUVBiasR_AVX
2261
2262 // Step 2: Find Y contribution to 16 R,G,B values
2263 vmovdqu xmm3, [eax] // NOLINT
2264 lea eax, [eax + 16]
2265 vpermq ymm3, ymm3, 0xd8
2266 vpunpcklbw ymm3, ymm3, ymm4
2267 vpsubsw ymm3, ymm3, kYSub16_AVX
2268 vpmullw ymm3, ymm3, kYToRgb_AVX
2269 vpaddsw ymm2, ymm2, ymm3 // B += Y
2270 vpaddsw ymm1, ymm1, ymm3 // G += Y
2271 vpaddsw ymm0, ymm0, ymm3 // R += Y
2272 vpsraw ymm2, ymm2, 6
2273 vpsraw ymm1, ymm1, 6
2274 vpsraw ymm0, ymm0, 6
2275 vpackuswb ymm2, ymm2, ymm2 // B
2276 vpackuswb ymm1, ymm1, ymm1 // G
2277 vpackuswb ymm0, ymm0, ymm0 // R
2278
2279 // Step 3: Weave into ARGB
2280 vpunpcklbw ymm2, ymm2, ymm1 // BG
2281 vpermq ymm2, ymm2, 0xd8
2282 vpunpcklbw ymm0, ymm0, ymm5 // RA
2283 vpermq ymm0, ymm0, 0xd8
2284 vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels
2285 vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels
2286 vmovdqu [edx], ymm1
2287 vmovdqu [edx + 32], ymm2
2288 lea edx, [edx + 64]
2289 sub ecx, 16
2290 jg convertloop
2291 vzeroupper
2292
2293 pop edi
2294 pop esi
2295 ret
2296 }
2297 }
2298 #endif // HAS_I422TOARGBROW_AVX2
2299
2300 #ifdef HAS_I422TOARGBROW_SSSE3
2301
2302 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2303
2304 // Read 8 UV from 444.
2305 #define READYUV444 __asm { \
2306 __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
2307 __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
2308 __asm lea esi, [esi + 8] \
2309 __asm punpcklbw xmm0, xmm1 /* UV */ \
2310 }
2311
2312 // Read 4 UV from 422, upsample to 8 UV.
2313 #define READYUV422 __asm { \
2314 __asm movd xmm0, [esi] /* U */ \
2315 __asm movd xmm1, [esi + edi] /* V */ \
2316 __asm lea esi, [esi + 4] \
2317 __asm punpcklbw xmm0, xmm1 /* UV */ \
2318 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2319 }
2320
2321 // Read 2 UV from 411, upsample to 8 UV.
2322 #define READYUV411 __asm { \
2323 __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \
2324 __asm movd xmm0, ebx \
2325 __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \
2326 __asm movd xmm1, ebx \
2327 __asm lea esi, [esi + 2] \
2328 __asm punpcklbw xmm0, xmm1 /* UV */ \
2329 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2330 __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
2331 }
2332
2333 // Read 4 UV from NV12, upsample to 8 UV.
2334 #define READNV12 __asm { \
2335 __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
2336 __asm lea esi, [esi + 8] \
2337 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2338 }
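
// The READYUV444/422/411 and READNV12 macros above differ only in how much
// chroma they load and how far they replicate it so that 8 interleaved U,V
// byte pairs line up with 8 Y samples. A small sketch of the replication
// pattern, guarded out of the build; the function name is illustrative only.
#if 0
static void UpsampleUVRow_Sketch(const uint8* u, const uint8* v,
                                 uint8 uv_pairs[16], int subsample) {
  // subsample: 1 = I444 (8 U and 8 V), 2 = I422 (4 each), 4 = I411 (2 each).
  // NV12 (READNV12) is the I422 case with U and V already interleaved, so
  // only the word-duplication step is needed there.
  for (int i = 0; i < 8; ++i) {
    uv_pairs[2 * i + 0] = u[i / subsample];  // replicate U across the group
    uv_pairs[2 * i + 1] = v[i / subsample];  // replicate V across the group
  }
}
#endif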
2339
2340 // Convert 8 pixels: 8 UV and 8 Y.
2341 #define YUVTORGB __asm { \
2342 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
2343 __asm movdqa xmm1, xmm0 \
2344 __asm movdqa xmm2, xmm0 \
2345 __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
2346 __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
2347 __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
2348 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
2349 __asm psubw xmm1, kUVBiasG \
2350 __asm psubw xmm2, kUVBiasR \
2351 /* Step 2: Find Y contribution to 8 R,G,B values */ \
2352 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
2353 __asm lea eax, [eax + 8] \
2354 __asm punpcklbw xmm3, xmm4 \
2355 __asm psubsw xmm3, kYSub16 \
2356 __asm pmullw xmm3, kYToRgb \
2357 __asm paddsw xmm0, xmm3 /* B += Y */ \
2358 __asm paddsw xmm1, xmm3 /* G += Y */ \
2359 __asm paddsw xmm2, xmm3 /* R += Y */ \
2360 __asm psraw xmm0, 6 \
2361 __asm psraw xmm1, 6 \
2362 __asm psraw xmm2, 6 \
2363 __asm packuswb xmm0, xmm0 /* B */ \
2364 __asm packuswb xmm1, xmm1 /* G */ \
2365 __asm packuswb xmm2, xmm2 /* R */ \
2366 }
2367
2368 // Convert 8 pixels: 8 VU and 8 Y.
2369 #define YVUTORGB __asm { \
2370 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
2371 __asm movdqa xmm1, xmm0 \
2372 __asm movdqa xmm2, xmm0 \
2373 __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
2374 __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
2375 __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
2376 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
2377 __asm psubw xmm1, kUVBiasG \
2378 __asm psubw xmm2, kUVBiasR \
2379 /* Step 2: Find Y contribution to 8 R,G,B values */ \
2380 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
2381 __asm lea eax, [eax + 8] \
2382 __asm punpcklbw xmm3, xmm4 \
2383 __asm psubsw xmm3, kYSub16 \
2384 __asm pmullw xmm3, kYToRgb \
2385 __asm paddsw xmm0, xmm3 /* B += Y */ \
2386 __asm paddsw xmm1, xmm3 /* G += Y */ \
2387 __asm paddsw xmm2, xmm3 /* R += Y */ \
2388 __asm psraw xmm0, 6 \
2389 __asm psraw xmm1, 6 \
2390 __asm psraw xmm2, 6 \
2391 __asm packuswb xmm0, xmm0 /* B */ \
2392 __asm packuswb xmm1, xmm1 /* G */ \
2393 __asm packuswb xmm2, xmm2 /* R */ \
2394 }
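
// The YUVTORGB and YVUTORGB macros above implement the same per pixel fixed
// point math; the sketch below shows it in scalar form for one pixel and is
// guarded out of the build (the name and the inline clamps are illustrative,
// standing in for packuswb saturation). UB..VR, YG and BB/BG/BR are the
// macros defined near the top of this file; they are copied into locals
// because the bias macros are unparenthesized expressions.
#if 0
static void YUVToARGBPixel_Sketch(uint8 y, uint8 u, uint8 v, uint8 argb[4]) {
  const int bias_b = BB, bias_g = BG, bias_r = BR;
  // pmaddubsw: unsigned U,V times signed coefficients; the bias (coef * 128)
  // is then subtracted so a neutral U = V = 128 contributes zero.
  int b = u * UB + v * VB - bias_b;
  int g = u * UG + v * VG - bias_g;
  int r = u * UR + v * VR - bias_r;
  int y1 = (y - 16) * YG;  // luma term, 1.164 scaled by 64 (YG == 74)
  b = (b + y1) >> 6;       // psraw xmm, 6
  g = (g + y1) >> 6;
  r = (r + y1) >> 6;
  argb[0] = (uint8)(b < 0 ? 0 : b > 255 ? 255 : b);  // B
  argb[1] = (uint8)(g < 0 ? 0 : g > 255 ? 255 : g);  // G
  argb[2] = (uint8)(r < 0 ? 0 : r > 255 ? 255 : r);  // R
  argb[3] = 255;                                     // A
}
#endif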
2395
2396 // 8 pixels, dest aligned 16.
2397 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2398 __declspec(naked) __declspec(align(16))
2399 void I444ToARGBRow_SSSE3(const uint8* y_buf,
2400 const uint8* u_buf,
2401 const uint8* v_buf,
2402 uint8* dst_argb,
2403 int width) {
2404 __asm {
2405 push esi
2406 push edi
2407 mov eax, [esp + 8 + 4] // Y
2408 mov esi, [esp + 8 + 8] // U
2409 mov edi, [esp + 8 + 12] // V
2410 mov edx, [esp + 8 + 16] // argb
2411 mov ecx, [esp + 8 + 20] // width
2412 sub edi, esi
2413 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2414 pxor xmm4, xmm4
2415
2416 align 4
2417 convertloop:
2418 READYUV444
2419 YUVTORGB
2420
2421 // Step 3: Weave into ARGB
2422 punpcklbw xmm0, xmm1 // BG
2423 punpcklbw xmm2, xmm5 // RA
2424 movdqa xmm1, xmm0
2425 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2426 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2427 movdqa [edx], xmm0
2428 movdqa [edx + 16], xmm1
2429 lea edx, [edx + 32]
2430 sub ecx, 8
2431 jg convertloop
2432
2433 pop edi
2434 pop esi
2435 ret
2436 }
2437 }
2438
2439 // 8 pixels, dest aligned 16.
2440 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2441 __declspec(naked) __declspec(align(16))
2442 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
2443 const uint8* u_buf,
2444 const uint8* v_buf,
2445 uint8* dst_rgb24,
2446 int width) {
2447 __asm {
2448 push esi
2449 push edi
2450 mov eax, [esp + 8 + 4] // Y
2451 mov esi, [esp + 8 + 8] // U
2452 mov edi, [esp + 8 + 12] // V
2453 mov edx, [esp + 8 + 16] // rgb24
2454 mov ecx, [esp + 8 + 20] // width
2455 sub edi, esi
2456 pxor xmm4, xmm4
2457 movdqa xmm5, kShuffleMaskARGBToRGB24_0
2458 movdqa xmm6, kShuffleMaskARGBToRGB24
2459
2460 align 4
2461 convertloop:
2462 READYUV422
2463 YUVTORGB
2464
2465 // Step 3: Weave into RRGB
2466 punpcklbw xmm0, xmm1 // BG
2467 punpcklbw xmm2, xmm2 // RR
2468 movdqa xmm1, xmm0
2469 punpcklwd xmm0, xmm2 // BGRR first 4 pixels
2470 punpckhwd xmm1, xmm2 // BGRR next 4 pixels
2471 pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
2472 pshufb xmm1, xmm6 // Pack into first 12 bytes.
2473 palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
2474 movq qword ptr [edx], xmm0 // First 8 bytes
2475 movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
2476 lea edx, [edx + 24]
2477 sub ecx, 8
2478 jg convertloop
2479
2480 pop edi
2481 pop esi
2482 ret
2483 }
2484 }
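
// After YUVTORGB the RGB24 path above weaves B,G,R,R and then uses the
// kShuffleMaskARGBToRGB24* masks plus palignr to keep 3 bytes per pixel.
// The net packing effect, sketched in scalar form and guarded out of the
// build (name illustrative): drop every fourth byte.
#if 0
static void ARGBToRGB24_Sketch(const uint8* src_argb, uint8* dst_rgb24,
                               int width) {
  for (int x = 0; x < width; ++x) {
    dst_rgb24[0] = src_argb[0];  // B
    dst_rgb24[1] = src_argb[1];  // G
    dst_rgb24[2] = src_argb[2];  // R (the duplicated R byte is dropped)
    dst_rgb24 += 3;
    src_argb += 4;
  }
}
#endif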
2485
2486 // 8 pixels, dest aligned 16.
2487 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2488 __declspec(naked) __declspec(align(16))
2489 void I422ToRAWRow_SSSE3(const uint8* y_buf,
2490 const uint8* u_buf,
2491 const uint8* v_buf,
2492 uint8* dst_raw,
2493 int width) {
2494 __asm {
2495 push esi
2496 push edi
2497 mov eax, [esp + 8 + 4] // Y
2498 mov esi, [esp + 8 + 8] // U
2499 mov edi, [esp + 8 + 12] // V
2500 mov edx, [esp + 8 + 16] // raw
2501 mov ecx, [esp + 8 + 20] // width
2502 sub edi, esi
2503 pxor xmm4, xmm4
2504 movdqa xmm5, kShuffleMaskARGBToRAW_0
2505 movdqa xmm6, kShuffleMaskARGBToRAW
2506
2507 align 4
2508 convertloop:
2509 READYUV422
2510 YUVTORGB
2511
2512 // Step 3: Weave into RRGB
2513 punpcklbw xmm0, xmm1 // BG
2514 punpcklbw xmm2, xmm2 // RR
2515 movdqa xmm1, xmm0
2516 punpcklwd xmm0, xmm2 // BGRR first 4 pixels
2517 punpckhwd xmm1, xmm2 // BGRR next 4 pixels
2518 pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
2519 pshufb xmm1, xmm6 // Pack into first 12 bytes.
2520 palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
2521 movq qword ptr [edx], xmm0 // First 8 bytes
2522 movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
2523 lea edx, [edx + 24]
2524 sub ecx, 8
2525 jg convertloop
2526
2527 pop edi
2528 pop esi
2529 ret
2530 }
2531 }
2532
2533 // 8 pixels, dest unaligned.
2534 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2535 __declspec(naked) __declspec(align(16))
2536 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
2537 const uint8* u_buf,
2538 const uint8* v_buf,
2539 uint8* rgb565_buf,
2540 int width) {
2541 __asm {
2542 push esi
2543 push edi
2544 mov eax, [esp + 8 + 4] // Y
2545 mov esi, [esp + 8 + 8] // U
2546 mov edi, [esp + 8 + 12] // V
2547 mov edx, [esp + 8 + 16] // rgb565
2548 mov ecx, [esp + 8 + 20] // width
2549 sub edi, esi
2550 pxor xmm4, xmm4
2551 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
2552 psrld xmm5, 27
2553 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
2554 psrld xmm6, 26
2555 pslld xmm6, 5
2556 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
2557 pslld xmm7, 11
2558
2559 align 4
2560 convertloop:
2561 READYUV422
2562 YUVTORGB
2563
2564 // Step 3: Weave into RRGB
2565 punpcklbw xmm0, xmm1 // BG
2566 punpcklbw xmm2, xmm2 // RR
2567 movdqa xmm1, xmm0
2568 punpcklwd xmm0, xmm2 // BGRR first 4 pixels
2569 punpckhwd xmm1, xmm2 // BGRR next 4 pixels
2570
2571 // Step 3b: RRGB -> RGB565
2572 movdqa xmm3, xmm0 // B first 4 pixels of argb
2573 movdqa xmm2, xmm0 // G
2574 pslld xmm0, 8 // R
2575 psrld xmm3, 3 // B
2576 psrld xmm2, 5 // G
2577 psrad xmm0, 16 // R
2578 pand xmm3, xmm5 // B
2579 pand xmm2, xmm6 // G
2580 pand xmm0, xmm7 // R
2581 por xmm3, xmm2 // BG
2582 por xmm0, xmm3 // BGR
2583 movdqa xmm3, xmm1 // B next 4 pixels of argb
2584 movdqa xmm2, xmm1 // G
2585 pslld xmm1, 8 // R
2586 psrld xmm3, 3 // B
2587 psrld xmm2, 5 // G
2588 psrad xmm1, 16 // R
2589 pand xmm3, xmm5 // B
2590 pand xmm2, xmm6 // G
2591 pand xmm1, xmm7 // R
2592 por xmm3, xmm2 // BG
2593 por xmm1, xmm3 // BGR
2594 packssdw xmm0, xmm1
2595 sub ecx, 8
2596 movdqu [edx], xmm0 // store 8 pixels of RGB565
2597 lea edx, [edx + 16]
2598 jg convertloop
2599
2600 pop edi
2601 pop esi
2602 ret
2603 }
2604 }
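
// Step 3b above builds the three RGB565 fields with shifts and the
// 0x1f / 0x7e0 / 0xf800 masks. Per pixel it is equivalent to the scalar
// packing below, guarded out of the build (name illustrative).
#if 0
static __inline uint16 ARGBToRGB565Pixel_Sketch(uint8 b, uint8 g, uint8 r) {
  // 5 bits of blue, 6 bits of green, 5 bits of red.
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}
#endif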
2605
2606 // 8 pixels, dest aligned 16.
2607 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2608 __declspec(naked) __declspec(align(16))
2609 void I422ToARGBRow_SSSE3(const uint8* y_buf,
2610 const uint8* u_buf,
2611 const uint8* v_buf,
2612 uint8* dst_argb,
2613 int width) {
2614 __asm {
2615 push esi
2616 push edi
2617 mov eax, [esp + 8 + 4] // Y
2618 mov esi, [esp + 8 + 8] // U
2619 mov edi, [esp + 8 + 12] // V
2620 mov edx, [esp + 8 + 16] // argb
2621 mov ecx, [esp + 8 + 20] // width
2622 sub edi, esi
2623 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2624 pxor xmm4, xmm4
2625
2626 align 4
2627 convertloop:
2628 READYUV422
2629 YUVTORGB
2630
2631 // Step 3: Weave into ARGB
2632 punpcklbw xmm0, xmm1 // BG
2633 punpcklbw xmm2, xmm5 // RA
2634 movdqa xmm1, xmm0
2635 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2636 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2637 movdqa [edx], xmm0
2638 movdqa [edx + 16], xmm1
2639 lea edx, [edx + 32]
2640 sub ecx, 8
2641 jg convertloop
2642
2643 pop edi
2644 pop esi
2645 ret
2646 }
2647 }
2648
2649 // 8 pixels, dest aligned 16.
2650 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2651 // Similar to I420 but duplicates UV once more.
2652 __declspec(naked) __declspec(align(16))
2653 void I411ToARGBRow_SSSE3(const uint8* y_buf,
2654 const uint8* u_buf,
2655 const uint8* v_buf,
2656 uint8* dst_argb,
2657 int width) {
2658 __asm {
2659 push ebx
2660 push esi
2661 push edi
2662 mov eax, [esp + 12 + 4] // Y
2663 mov esi, [esp + 12 + 8] // U
2664 mov edi, [esp + 12 + 12] // V
2665 mov edx, [esp + 12 + 16] // argb
2666 mov ecx, [esp + 12 + 20] // width
2667 sub edi, esi
2668 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2669 pxor xmm4, xmm4
2670
2671 align 4
2672 convertloop:
2673 READYUV411 // modifies EBX
2674 YUVTORGB
2675
2676 // Step 3: Weave into ARGB
2677 punpcklbw xmm0, xmm1 // BG
2678 punpcklbw xmm2, xmm5 // RA
2679 movdqa xmm1, xmm0
2680 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2681 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2682 movdqa [edx], xmm0
2683 movdqa [edx + 16], xmm1
2684 lea edx, [edx + 32]
2685 sub ecx, 8
2686 jg convertloop
2687
2688 pop edi
2689 pop esi
2690 pop ebx
2691 ret
2692 }
2693 }
2694
2695 // 8 pixels, dest aligned 16.
2696 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2697 __declspec(naked) __declspec(align(16))
2698 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
2699 const uint8* uv_buf,
2700 uint8* dst_argb,
2701 int width) {
2702 __asm {
2703 push esi
2704 mov eax, [esp + 4 + 4] // Y
2705 mov esi, [esp + 4 + 8] // UV
2706 mov edx, [esp + 4 + 12] // argb
2707 mov ecx, [esp + 4 + 16] // width
2708 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2709 pxor xmm4, xmm4
2710
2711 align 4
2712 convertloop:
2713 READNV12
2714 YUVTORGB
2715
2716 // Step 3: Weave into ARGB
2717 punpcklbw xmm0, xmm1 // BG
2718 punpcklbw xmm2, xmm5 // RA
2719 movdqa xmm1, xmm0
2720 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2721 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2722 movdqa [edx], xmm0
2723 movdqa [edx + 16], xmm1
2724 lea edx, [edx + 32]
2725 sub ecx, 8
2726 jg convertloop
2727
2728 pop esi
2729 ret
2730 }
2731 }
2732
2733 // 8 pixels, dest aligned 16.
2734 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2735 __declspec(naked) __declspec(align(16))
2736 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
2737 const uint8* uv_buf,
2738 uint8* dst_argb,
2739 int width) {
2740 __asm {
2741 push esi
2742 mov eax, [esp + 4 + 4] // Y
2743 mov esi, [esp + 4 + 8] // VU
2744 mov edx, [esp + 4 + 12] // argb
2745 mov ecx, [esp + 4 + 16] // width
2746 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2747 pxor xmm4, xmm4
2748
2749 align 4
2750 convertloop:
2751 READNV12
2752 YVUTORGB
2753
2754 // Step 3: Weave into ARGB
2755 punpcklbw xmm0, xmm1 // BG
2756 punpcklbw xmm2, xmm5 // RA
2757 movdqa xmm1, xmm0
2758 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2759 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2760 movdqa [edx], xmm0
2761 movdqa [edx + 16], xmm1
2762 lea edx, [edx + 32]
2763 sub ecx, 8
2764 jg convertloop
2765
2766 pop esi
2767 ret
2768 }
2769 }
2770
2771 // 8 pixels, unaligned.
2772 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2773 __declspec(naked) __declspec(align(16))
2774 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2775 const uint8* u_buf,
2776 const uint8* v_buf,
2777 uint8* dst_argb,
2778 int width) {
2779 __asm {
2780 push esi
2781 push edi
2782 mov eax, [esp + 8 + 4] // Y
2783 mov esi, [esp + 8 + 8] // U
2784 mov edi, [esp + 8 + 12] // V
2785 mov edx, [esp + 8 + 16] // argb
2786 mov ecx, [esp + 8 + 20] // width
2787 sub edi, esi
2788 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2789 pxor xmm4, xmm4
2790
2791 align 4
2792 convertloop:
2793 READYUV444
2794 YUVTORGB
2795
2796 // Step 3: Weave into ARGB
2797 punpcklbw xmm0, xmm1 // BG
2798 punpcklbw xmm2, xmm5 // RA
2799 movdqa xmm1, xmm0
2800 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2801 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2802 movdqu [edx], xmm0
2803 movdqu [edx + 16], xmm1
2804 lea edx, [edx + 32]
2805 sub ecx, 8
2806 jg convertloop
2807
2808 pop edi
2809 pop esi
2810 ret
2811 }
2812 }
2813
2814 // 8 pixels, unaligned.
2815 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2816 __declspec(naked) __declspec(align(16))
2817 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2818 const uint8* u_buf,
2819 const uint8* v_buf,
2820 uint8* dst_argb,
2821 int width) {
2822 __asm {
2823 push esi
2824 push edi
2825 mov eax, [esp + 8 + 4] // Y
2826 mov esi, [esp + 8 + 8] // U
2827 mov edi, [esp + 8 + 12] // V
2828 mov edx, [esp + 8 + 16] // argb
2829 mov ecx, [esp + 8 + 20] // width
2830 sub edi, esi
2831 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2832 pxor xmm4, xmm4
2833
2834 align 4
2835 convertloop:
2836 READYUV422
2837 YUVTORGB
2838
2839 // Step 3: Weave into ARGB
2840 punpcklbw xmm0, xmm1 // BG
2841 punpcklbw xmm2, xmm5 // RA
2842 movdqa xmm1, xmm0
2843 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2844 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2845 movdqu [edx], xmm0
2846 movdqu [edx + 16], xmm1
2847 lea edx, [edx + 32]
2848 sub ecx, 8
2849 jg convertloop
2850
2851 pop edi
2852 pop esi
2853 ret
2854 }
2855 }
2856
2857 // 8 pixels, unaligned.
2858 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2859 // Similar to I420 but duplicates UV once more.
2860 __declspec(naked) __declspec(align(16))
2861 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2862 const uint8* u_buf,
2863 const uint8* v_buf,
2864 uint8* dst_argb,
2865 int width) {
2866 __asm {
2867 push ebx
2868 push esi
2869 push edi
2870 mov eax, [esp + 12 + 4] // Y
2871 mov esi, [esp + 12 + 8] // U
2872 mov edi, [esp + 12 + 12] // V
2873 mov edx, [esp + 12 + 16] // argb
2874 mov ecx, [esp + 12 + 20] // width
2875 sub edi, esi
2876 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2877 pxor xmm4, xmm4
2878
2879 align 4
2880 convertloop:
2881 READYUV411 // modifies EBX
2882 YUVTORGB
2883
2884 // Step 3: Weave into ARGB
2885 punpcklbw xmm0, xmm1 // BG
2886 punpcklbw xmm2, xmm5 // RA
2887 movdqa xmm1, xmm0
2888 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2889 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2890 movdqu [edx], xmm0
2891 movdqu [edx + 16], xmm1
2892 lea edx, [edx + 32]
2893 sub ecx, 8
2894 jg convertloop
2895
2896 pop edi
2897 pop esi
2898 pop ebx
2899 ret
2900 }
2901 }
2902
2903 // 8 pixels, dest unaligned.
2904 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2905 __declspec(naked) __declspec(align(16))
2906 void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2907 const uint8* uv_buf,
2908 uint8* dst_argb,
2909 int width) {
2910 __asm {
2911 push esi
2912 mov eax, [esp + 4 + 4] // Y
2913 mov esi, [esp + 4 + 8] // UV
2914 mov edx, [esp + 4 + 12] // argb
2915 mov ecx, [esp + 4 + 16] // width
2916 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2917 pxor xmm4, xmm4
2918
2919 align 4
2920 convertloop:
2921 READNV12
2922 YUVTORGB
2923
2924 // Step 3: Weave into ARGB
2925 punpcklbw xmm0, xmm1 // BG
2926 punpcklbw xmm2, xmm5 // RA
2927 movdqa xmm1, xmm0
2928 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2929 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2930 movdqu [edx], xmm0
2931 movdqu [edx + 16], xmm1
2932 lea edx, [edx + 32]
2933 sub ecx, 8
2934 jg convertloop
2935
2936 pop esi
2937 ret
2938 }
2939 }
2940
2941 // 8 pixels, dest unaligned.
2942 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2943 __declspec(naked) __declspec(align(16))
2944 void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2945 const uint8* uv_buf,
2946 uint8* dst_argb,
2947 int width) {
2948 __asm {
2949 push esi
2950 mov eax, [esp + 4 + 4] // Y
2951 mov esi, [esp + 4 + 8] // VU
2952 mov edx, [esp + 4 + 12] // argb
2953 mov ecx, [esp + 4 + 16] // width
2954 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2955 pxor xmm4, xmm4
2956
2957 align 4
2958 convertloop:
2959 READNV12
2960 YVUTORGB
2961
2962 // Step 3: Weave into ARGB
2963 punpcklbw xmm0, xmm1 // BG
2964 punpcklbw xmm2, xmm5 // RA
2965 movdqa xmm1, xmm0
2966 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2967 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2968 movdqu [edx], xmm0
2969 movdqu [edx + 16], xmm1
2970 lea edx, [edx + 32]
2971 sub ecx, 8
2972 jg convertloop
2973
2974 pop esi
2975 ret
2976 }
2977 }
2978
2979 __declspec(naked) __declspec(align(16))
2980 void I422ToBGRARow_SSSE3(const uint8* y_buf,
2981 const uint8* u_buf,
2982 const uint8* v_buf,
2983 uint8* dst_bgra,
2984 int width) {
2985 __asm {
2986 push esi
2987 push edi
2988 mov eax, [esp + 8 + 4] // Y
2989 mov esi, [esp + 8 + 8] // U
2990 mov edi, [esp + 8 + 12] // V
2991 mov edx, [esp + 8 + 16] // bgra
2992 mov ecx, [esp + 8 + 20] // width
2993 sub edi, esi
2994 pxor xmm4, xmm4
2995
2996 align 4
2997 convertloop:
2998 READYUV422
2999 YUVTORGB
3000
3001 // Step 3: Weave into BGRA
3002 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3003 punpcklbw xmm1, xmm0 // GB
3004 punpcklbw xmm5, xmm2 // AR
3005 movdqa xmm0, xmm5
3006 punpcklwd xmm5, xmm1 // BGRA first 4 pixels
3007 punpckhwd xmm0, xmm1 // BGRA next 4 pixels
3008 movdqa [edx], xmm5
3009 movdqa [edx + 16], xmm0
3010 lea edx, [edx + 32]
3011 sub ecx, 8
3012 jg convertloop
3013
3014 pop edi
3015 pop esi
3016 ret
3017 }
3018 }
3019
3020 __declspec(naked) __declspec(align(16))
3021 void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
3022 const uint8* u_buf,
3023 const uint8* v_buf,
3024 uint8* dst_bgra,
3025 int width) {
3026 __asm {
3027 push esi
3028 push edi
3029 mov eax, [esp + 8 + 4] // Y
3030 mov esi, [esp + 8 + 8] // U
3031 mov edi, [esp + 8 + 12] // V
3032 mov edx, [esp + 8 + 16] // bgra
3033 mov ecx, [esp + 8 + 20] // width
3034 sub edi, esi
3035 pxor xmm4, xmm4
3036
3037 align 4
3038 convertloop:
3039 READYUV422
3040 YUVTORGB
3041
3042 // Step 3: Weave into BGRA
3043 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3044 punpcklbw xmm1, xmm0 // GB
3045 punpcklbw xmm5, xmm2 // AR
3046 movdqa xmm0, xmm5
3047 punpcklwd xmm5, xmm1 // BGRA first 4 pixels
3048 punpckhwd xmm0, xmm1 // BGRA next 4 pixels
3049 movdqu [edx], xmm5
3050 movdqu [edx + 16], xmm0
3051 lea edx, [edx + 32]
3052 sub ecx, 8
3053 jg convertloop
3054
3055 pop edi
3056 pop esi
3057 ret
3058 }
3059 }
3060
3061 __declspec(naked) __declspec(align(16))
3062 void I422ToABGRRow_SSSE3(const uint8* y_buf,
3063 const uint8* u_buf,
3064 const uint8* v_buf,
3065 uint8* dst_abgr,
3066 int width) {
3067 __asm {
3068 push esi
3069 push edi
3070 mov eax, [esp + 8 + 4] // Y
3071 mov esi, [esp + 8 + 8] // U
3072 mov edi, [esp + 8 + 12] // V
3073 mov edx, [esp + 8 + 16] // abgr
3074 mov ecx, [esp + 8 + 20] // width
3075 sub edi, esi
3076 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3077 pxor xmm4, xmm4
3078
3079 align 4
3080 convertloop:
3081 READYUV422
3082 YUVTORGB
3083
3084 // Step 3: Weave into ARGB
3085 punpcklbw xmm2, xmm1 // RG
3086 punpcklbw xmm0, xmm5 // BA
3087 movdqa xmm1, xmm2
3088 punpcklwd xmm2, xmm0 // RGBA first 4 pixels
3089 punpckhwd xmm1, xmm0 // RGBA next 4 pixels
3090 movdqa [edx], xmm2
3091 movdqa [edx + 16], xmm1
3092 lea edx, [edx + 32]
3093 sub ecx, 8
3094 jg convertloop
3095
3096 pop edi
3097 pop esi
3098 ret
3099 }
3100 }
3101
3102 __declspec(naked) __declspec(align(16))
3103 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
3104 const uint8* u_buf,
3105 const uint8* v_buf,
3106 uint8* dst_abgr,
3107 int width) {
3108 __asm {
3109 push esi
3110 push edi
3111 mov eax, [esp + 8 + 4] // Y
3112 mov esi, [esp + 8 + 8] // U
3113 mov edi, [esp + 8 + 12] // V
3114 mov edx, [esp + 8 + 16] // abgr
3115 mov ecx, [esp + 8 + 20] // width
3116 sub edi, esi
3117 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3118 pxor xmm4, xmm4
3119
3120 align 4
3121 convertloop:
3122 READYUV422
3123 YUVTORGB
3124
3125 // Step 3: Weave into ARGB
3126 punpcklbw xmm2, xmm1 // RG
3127 punpcklbw xmm0, xmm5 // BA
3128 movdqa xmm1, xmm2
3129 punpcklwd xmm2, xmm0 // RGBA first 4 pixels
3130 punpckhwd xmm1, xmm0 // RGBA next 4 pixels
3131 movdqu [edx], xmm2
3132 movdqu [edx + 16], xmm1
3133 lea edx, [edx + 32]
3134 sub ecx, 8
3135 jg convertloop
3136
3137 pop edi
3138 pop esi
3139 ret
3140 }
3141 }
3142
3143 __declspec(naked) __declspec(align(16))
3144 void I422ToRGBARow_SSSE3(const uint8* y_buf,
3145 const uint8* u_buf,
3146 const uint8* v_buf,
3147 uint8* dst_rgba,
3148 int width) {
3149 __asm {
3150 push esi
3151 push edi
3152 mov eax, [esp + 8 + 4] // Y
3153 mov esi, [esp + 8 + 8] // U
3154 mov edi, [esp + 8 + 12] // V
3155 mov edx, [esp + 8 + 16] // rgba
3156 mov ecx, [esp + 8 + 20] // width
3157 sub edi, esi
3158 pxor xmm4, xmm4
3159
3160 align 4
3161 convertloop:
3162 READYUV422
3163 YUVTORGB
3164
3165 // Step 3: Weave into RGBA
3166 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3167 punpcklbw xmm1, xmm2 // GR
3168 punpcklbw xmm5, xmm0 // AB
3169 movdqa xmm0, xmm5
3170 punpcklwd xmm5, xmm1 // RGBA first 4 pixels
3171 punpckhwd xmm0, xmm1 // RGBA next 4 pixels
3172 movdqa [edx], xmm5
3173 movdqa [edx + 16], xmm0
3174 lea edx, [edx + 32]
3175 sub ecx, 8
3176 jg convertloop
3177
3178 pop edi
3179 pop esi
3180 ret
3181 }
3182 }
3183
3184 __declspec(naked) __declspec(align(16))
3185 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
3186 const uint8* u_buf,
3187 const uint8* v_buf,
3188 uint8* dst_rgba,
3189 int width) {
3190 __asm {
3191 push esi
3192 push edi
3193 mov eax, [esp + 8 + 4] // Y
3194 mov esi, [esp + 8 + 8] // U
3195 mov edi, [esp + 8 + 12] // V
3196 mov edx, [esp + 8 + 16] // rgba
3197 mov ecx, [esp + 8 + 20] // width
3198 sub edi, esi
3199 pxor xmm4, xmm4
3200
3201 align 4
3202 convertloop:
3203 READYUV422
3204 YUVTORGB
3205
3206 // Step 3: Weave into RGBA
3207 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
3208 punpcklbw xmm1, xmm2 // GR
3209 punpcklbw xmm5, xmm0 // AB
3210 movdqa xmm0, xmm5
3211 punpcklwd xmm5, xmm1 // RGBA first 4 pixels
3212 punpckhwd xmm0, xmm1 // RGBA next 4 pixels
3213 movdqu [edx], xmm5
3214 movdqu [edx + 16], xmm0
3215 lea edx, [edx + 32]
3216 sub ecx, 8
3217 jg convertloop
3218
3219 pop edi
3220 pop esi
3221 ret
3222 }
3223 }
3224
3225 #endif // HAS_I422TOARGBROW_SSSE3
3226
3227 #ifdef HAS_YTOARGBROW_SSE2
3228 __declspec(naked) __declspec(align(16))
3229 void YToARGBRow_SSE2(const uint8* y_buf,
3230 uint8* rgb_buf,
3231 int width) {
3232 __asm {
3233 pxor xmm5, xmm5
3234 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
3235 pslld xmm4, 24
3236 mov eax, 0x00100010
3237 movd xmm3, eax
3238 pshufd xmm3, xmm3, 0
3239 mov eax, 0x004a004a // 74
3240 movd xmm2, eax
3241 pshufd xmm2, xmm2, 0
3242 mov eax, [esp + 4] // Y
3243 mov edx, [esp + 8] // rgb
3244 mov ecx, [esp + 12] // width
3245
3246 align 4
3247 convertloop:
3248 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
3249 movq xmm0, qword ptr [eax]
3250 lea eax, [eax + 8]
3251 punpcklbw xmm0, xmm5 // 0.Y
3252 psubusw xmm0, xmm3
3253 pmullw xmm0, xmm2
3254 psrlw xmm0, 6
3255 packuswb xmm0, xmm0 // G
3256
3257 // Step 2: Weave into ARGB
3258 punpcklbw xmm0, xmm0 // GG
3259 movdqa xmm1, xmm0
3260 punpcklwd xmm0, xmm0 // BGRA first 4 pixels
3261 punpckhwd xmm1, xmm1 // BGRA next 4 pixels
3262 por xmm0, xmm4
3263 por xmm1, xmm4
3264 movdqa [edx], xmm0
3265 movdqa [edx + 16], xmm1
3266 lea edx, [edx + 32]
3267 sub ecx, 8
3268 jg convertloop
3269
3270 ret
3271 }
3272 }
3273 #endif // HAS_YTOARGBROW_SSE2
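
// YToARGBRow_SSE2 above expands luma into a grey, fully opaque ARGB pixel
// with the same (y - 16) * 1.164 fixed point math used elsewhere in this
// file (the 0x0010 subtract, the 0x004a == 74 multiply, shift right by 6).
// A scalar sketch, guarded out of the build; the name is illustrative.
#if 0
static void YToARGBRow_ScalarSketch(const uint8* y_buf, uint8* rgb_buf,
                                    int width) {
  for (int x = 0; x < width; ++x) {
    int y = y_buf[x] - 16;
    if (y < 0) y = 0;               // psubusw clamps at zero
    int g = (y * 74) >> 6;          // 74 == round(1.164 * 64)
    if (g > 255) g = 255;           // packuswb saturates
    rgb_buf[0] = rgb_buf[1] = rgb_buf[2] = (uint8)g;  // B = G = R (grey)
    rgb_buf[3] = 255;               // alpha from the 0xff000000 mask
    rgb_buf += 4;
  }
}
#endif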
3274
3275 #ifdef HAS_MIRRORROW_SSSE3
3276 // Shuffle table for reversing the bytes.
3277 static const uvec8 kShuffleMirror = {
3278 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3279 };
3280
3281 __declspec(naked) __declspec(align(16))
3282 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3283 __asm {
3284 mov eax, [esp + 4] // src
3285 mov edx, [esp + 8] // dst
3286 mov ecx, [esp + 12] // width
3287 movdqa xmm5, kShuffleMirror
3288 lea eax, [eax - 16]
3289
3290 align 4
3291 convertloop:
3292 movdqa xmm0, [eax + ecx]
3293 pshufb xmm0, xmm5
3294 sub ecx, 16
3295 movdqa [edx], xmm0
3296 lea edx, [edx + 16]
3297 jg convertloop
3298 ret
3299 }
3300 }
3301 #endif // HAS_MIRRORROW_SSSE3
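
// MirrorRow_SSSE3 above reverses 16 bytes per iteration with pshufb and
// kShuffleMirror while walking the source backwards (the [eax + ecx]
// addressing with eax biased by -16). The whole-row effect is simply the
// scalar reversal below, guarded out of the build (name illustrative).
#if 0
static void MirrorRow_ScalarSketch(const uint8* src, uint8* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];  // reverse the byte order of the row
  }
}
#endif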
3302
3303 #ifdef HAS_MIRRORROW_AVX2
3304 // Shuffle table for reversing the bytes.
3305 static const ulvec8 kShuffleMirror_AVX2 = {
3306 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
3307 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3308 };
3309
3310 __declspec(naked) __declspec(align(16))
3311 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3312 __asm {
3313 mov eax, [esp + 4] // src
3314 mov edx, [esp + 8] // dst
3315 mov ecx, [esp + 12] // width
3316 vmovdqa ymm5, kShuffleMirror_AVX2
3317 lea eax, [eax - 32]
3318
3319 align 4
3320 convertloop:
3321 vmovdqu ymm0, [eax + ecx]
3322 vpshufb ymm0, ymm0, ymm5
3323 vpermq ymm0, ymm0, 0x4e // swap high and low halves
3324 sub ecx, 32
3325 vmovdqu [edx], ymm0
3326 lea edx, [edx + 32]
3327 jg convertloop
3328 vzeroupper
3329 ret
3330 }
3331 }
3332 #endif // HAS_MIRRORROW_AVX2
3333
3334 #ifdef HAS_MIRRORROW_SSE2
3335 // The SSE2 version uses movdqu so it can be used on unaligned buffers where the
3336 // SSSE3 version cannot.
3337 __declspec(naked) __declspec(align(16))
3338 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3339 __asm {
3340 mov eax, [esp + 4] // src
3341 mov edx, [esp + 8] // dst
3342 mov ecx, [esp + 12] // width
3343 lea eax, [eax - 16]
3344
3345 align 4
3346 convertloop:
3347 movdqu xmm0, [eax + ecx]
3348 movdqa xmm1, xmm0 // swap bytes
3349 psllw xmm0, 8
3350 psrlw xmm1, 8
3351 por xmm0, xmm1
3352 pshuflw xmm0, xmm0, 0x1b // swap words
3353 pshufhw xmm0, xmm0, 0x1b
3354 pshufd xmm0, xmm0, 0x4e // swap qwords
3355 sub ecx, 16
3356 movdqu [edx], xmm0
3357 lea edx, [edx + 16]
3358 jg convertloop
3359 ret
3360 }
3361 }
3362 #endif // HAS_MIRRORROW_SSE2
3363
3364 #ifdef HAS_MIRRORROW_UV_SSSE3
3365 // Shuffle table for reversing the bytes of UV channels.
3366 static const uvec8 kShuffleMirrorUV = {
3367 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
3368 };
3369
3370 __declspec(naked) __declspec(align(16))
3371 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
3372 int width) {
3373 __asm {
3374 push edi
3375 mov eax, [esp + 4 + 4] // src
3376 mov edx, [esp + 4 + 8] // dst_u
3377 mov edi, [esp + 4 + 12] // dst_v
3378 mov ecx, [esp + 4 + 16] // width
3379 movdqa xmm1, kShuffleMirrorUV
3380 lea eax, [eax + ecx * 2 - 16]
3381 sub edi, edx
3382
3383 align 4
3384 convertloop:
3385 movdqa xmm0, [eax]
3386 lea eax, [eax - 16]
3387 pshufb xmm0, xmm1
3388 sub ecx, 8
3389 movlpd qword ptr [edx], xmm0
3390 movhpd qword ptr [edx + edi], xmm0
3391 lea edx, [edx + 8]
3392 jg convertloop
3393
3394 pop edi
3395 ret
3396 }
3397 }
3398 #endif // HAS_MIRRORROW_UV_SSSE3
3399
3400 #ifdef HAS_ARGBMIRRORROW_SSSE3
3401 // Shuffle table for reversing the bytes.
3402 static const uvec8 kARGBShuffleMirror = {
3403 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
3404 };
3405
3406 __declspec(naked) __declspec(align(16))
3407 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3408 __asm {
3409 mov eax, [esp + 4] // src
3410 mov edx, [esp + 8] // dst
3411 mov ecx, [esp + 12] // width
3412 lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
3413 movdqa xmm5, kARGBShuffleMirror
3414
3415 align 4
3416 convertloop:
3417 movdqa xmm0, [eax]
3418 lea eax, [eax - 16]
3419 pshufb xmm0, xmm5
3420 sub ecx, 4
3421 movdqa [edx], xmm0
3422 lea edx, [edx + 16]
3423 jg convertloop
3424 ret
3425 }
3426 }
3427 #endif // HAS_ARGBMIRRORROW_SSSE3
3428
3429 #ifdef HAS_ARGBMIRRORROW_AVX2
3430 // Shuffle table for reversing the bytes.
3431 static const ulvec32 kARGBShuffleMirror_AVX2 = {
3432 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3433 };
3434
3435 __declspec(naked) __declspec(align(16))
3436 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3437 __asm {
3438 mov eax, [esp + 4] // src
3439 mov edx, [esp + 8] // dst
3440 mov ecx, [esp + 12] // width
3441 lea eax, [eax - 32]
3442 vmovdqa ymm5, kARGBShuffleMirror_AVX2
3443
3444 align 4
3445 convertloop:
3446 vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order
3447 sub ecx, 8
3448 vmovdqu [edx], ymm0
3449 lea edx, [edx + 32]
3450 jg convertloop
3451 vzeroupper
3452 ret
3453 }
3454 }
3455 #endif // HAS_ARGBMIRRORROW_AVX2
3456
3457 #ifdef HAS_SPLITUVROW_SSE2
3458 __declspec(naked) __declspec(align(16))
3459 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3460 __asm {
3461 push edi
3462 mov eax, [esp + 4 + 4] // src_uv
3463 mov edx, [esp + 4 + 8] // dst_u
3464 mov edi, [esp + 4 + 12] // dst_v
3465 mov ecx, [esp + 4 + 16] // pix
3466 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3467 psrlw xmm5, 8
3468 sub edi, edx
3469
3470 align 4
3471 convertloop:
3472 movdqa xmm0, [eax]
3473 movdqa xmm1, [eax + 16]
3474 lea eax, [eax + 32]
3475 movdqa xmm2, xmm0
3476 movdqa xmm3, xmm1
3477 pand xmm0, xmm5 // even bytes
3478 pand xmm1, xmm5
3479 packuswb xmm0, xmm1
3480 psrlw xmm2, 8 // odd bytes
3481 psrlw xmm3, 8
3482 packuswb xmm2, xmm3
3483 movdqa [edx], xmm0
3484 movdqa [edx + edi], xmm2
3485 lea edx, [edx + 16]
3486 sub ecx, 16
3487 jg convertloop
3488
3489 pop edi
3490 ret
3491 }
3492 }
3493
3494 __declspec(naked) __declspec(align(16))
3495 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
3496 int pix) {
3497 __asm {
3498 push edi
3499 mov eax, [esp + 4 + 4] // src_uv
3500 mov edx, [esp + 4 + 8] // dst_u
3501 mov edi, [esp + 4 + 12] // dst_v
3502 mov ecx, [esp + 4 + 16] // pix
3503 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3504 psrlw xmm5, 8
3505 sub edi, edx
3506
3507 align 4
3508 convertloop:
3509 movdqu xmm0, [eax]
3510 movdqu xmm1, [eax + 16]
3511 lea eax, [eax + 32]
3512 movdqa xmm2, xmm0
3513 movdqa xmm3, xmm1
3514 pand xmm0, xmm5 // even bytes
3515 pand xmm1, xmm5
3516 packuswb xmm0, xmm1
3517 psrlw xmm2, 8 // odd bytes
3518 psrlw xmm3, 8
3519 packuswb xmm2, xmm3
3520 movdqu [edx], xmm0
3521 movdqu [edx + edi], xmm2
3522 lea edx, [edx + 16]
3523 sub ecx, 16
3524 jg convertloop
3525
3526 pop edi
3527 ret
3528 }
3529 }
3530 #endif // HAS_SPLITUVROW_SSE2
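
// SplitUVRow deinterleaves packed UV into separate U and V planes; the SSE2
// code above does this 16 pairs at a time using a 0x00ff mask for the even
// bytes and a shift for the odd bytes. Scalar sketch, guarded out of the
// build (name illustrative).
#if 0
static void SplitUVRow_ScalarSketch(const uint8* src_uv, uint8* dst_u,
                                    uint8* dst_v, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[2 * x + 0];  // even bytes are U
    dst_v[x] = src_uv[2 * x + 1];  // odd bytes are V
  }
}
#endif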
3531
3532 #ifdef HAS_SPLITUVROW_AVX2
3533 __declspec(naked) __declspec(align(16))
3534 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
3535 __asm {
3536 push edi
3537 mov eax, [esp + 4 + 4] // src_uv
3538 mov edx, [esp + 4 + 8] // dst_u
3539 mov edi, [esp + 4 + 12] // dst_v
3540 mov ecx, [esp + 4 + 16] // pix
3541 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3542 vpsrlw ymm5, ymm5, 8
3543 sub edi, edx
3544
3545 align 4
3546 convertloop:
3547 vmovdqu ymm0, [eax]
3548 vmovdqu ymm1, [eax + 32]
3549 lea eax, [eax + 64]
3550 vpsrlw ymm2, ymm0, 8 // odd bytes
3551 vpsrlw ymm3, ymm1, 8
3552 vpand ymm0, ymm0, ymm5 // even bytes
3553 vpand ymm1, ymm1, ymm5
3554 vpackuswb ymm0, ymm0, ymm1
3555 vpackuswb ymm2, ymm2, ymm3
3556 vpermq ymm0, ymm0, 0xd8
3557 vpermq ymm2, ymm2, 0xd8
3558 vmovdqu [edx], ymm0
3559 vmovdqu [edx + edi], ymm2
3560 lea edx, [edx + 32]
3561 sub ecx, 32
3562 jg convertloop
3563
3564 pop edi
3565 vzeroupper
3566 ret
3567 }
3568 }
3569 #endif // HAS_SPLITUVROW_AVX2
3570
3571 #ifdef HAS_MERGEUVROW_SSE2
3572 __declspec(naked) __declspec(align(16))
3573 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3574 int width) {
3575 __asm {
3576 push edi
3577 mov eax, [esp + 4 + 4] // src_u
3578 mov edx, [esp + 4 + 8] // src_v
3579 mov edi, [esp + 4 + 12] // dst_uv
3580 mov ecx, [esp + 4 + 16] // width
3581 sub edx, eax
3582
3583 align 4
3584 convertloop:
3585 movdqa xmm0, [eax] // read 16 U's
3586 movdqa xmm1, [eax + edx] // and 16 V's
3587 lea eax, [eax + 16]
3588 movdqa xmm2, xmm0
3589 punpcklbw xmm0, xmm1 // first 8 UV pairs
3590 punpckhbw xmm2, xmm1 // next 8 UV pairs
3591 movdqa [edi], xmm0
3592 movdqa [edi + 16], xmm2
3593 lea edi, [edi + 32]
3594 sub ecx, 16
3595 jg convertloop
3596
3597 pop edi
3598 ret
3599 }
3600 }
3601
3602 __declspec(naked) __declspec(align(16))
3603 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
3604 uint8* dst_uv, int width) {
3605 __asm {
3606 push edi
3607 mov eax, [esp + 4 + 4] // src_u
3608 mov edx, [esp + 4 + 8] // src_v
3609 mov edi, [esp + 4 + 12] // dst_uv
3610 mov ecx, [esp + 4 + 16] // width
3611 sub edx, eax
3612
3613 align 4
3614 convertloop:
3615 movdqu xmm0, [eax] // read 16 U's
3616 movdqu xmm1, [eax + edx] // and 16 V's
3617 lea eax, [eax + 16]
3618 movdqa xmm2, xmm0
3619 punpcklbw xmm0, xmm1 // first 8 UV pairs
3620 punpckhbw xmm2, xmm1 // next 8 UV pairs
3621 movdqu [edi], xmm0
3622 movdqu [edi + 16], xmm2
3623 lea edi, [edi + 32]
3624 sub ecx, 16
3625 jg convertloop
3626
3627 pop edi
3628 ret
3629 }
3630 }
3631 #endif // HAS_MERGEUVROW_SSE2
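
// MergeUVRow is the inverse of SplitUVRow: it interleaves separate U and V
// planes back into packed UV pairs (the punpcklbw/punpckhbw above). Scalar
// sketch, guarded out of the build (name illustrative).
#if 0
static void MergeUVRow_ScalarSketch(const uint8* src_u, const uint8* src_v,
                                    uint8* dst_uv, int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];  // U goes to the even bytes
    dst_uv[2 * x + 1] = src_v[x];  // V goes to the odd bytes
  }
}
#endif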
3632
3633 #ifdef HAS_MERGEUVROW_AVX2
3634 __declspec(naked) __declspec(align(16))
3635 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
3636 int width) {
3637 __asm {
3638 push edi
3639 mov eax, [esp + 4 + 4] // src_u
3640 mov edx, [esp + 4 + 8] // src_v
3641 mov edi, [esp + 4 + 12] // dst_uv
3642 mov ecx, [esp + 4 + 16] // width
3643 sub edx, eax
3644
3645 align 4
3646 convertloop:
3647 vmovdqu ymm0, [eax] // read 32 U's
3648 vmovdqu ymm1, [eax + edx] // and 32 V's
3649 lea eax, [eax + 32]
3650 vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
3651 vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
3652 vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
3653 vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
3654 vmovdqu [edi], ymm1
3655 vmovdqu [edi + 32], ymm2
3656 lea edi, [edi + 64]
3657 sub ecx, 32
3658 jg convertloop
3659
3660 pop edi
3661 vzeroupper
3662 ret
3663 }
3664 }
3665 #endif // HAS_MERGEUVROW_AVX2
3666
3667 #ifdef HAS_COPYROW_SSE2
3668 // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
3669 __declspec(naked) __declspec(align(16))
3670 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
3671 __asm {
3672 mov eax, [esp + 4] // src
3673 mov edx, [esp + 8] // dst
3674 mov ecx, [esp + 12] // count
3675
3676 align 4
3677 convertloop:
3678 movdqa xmm0, [eax]
3679 movdqa xmm1, [eax + 16]
3680 lea eax, [eax + 32]
3681 movdqa [edx], xmm0
3682 movdqa [edx + 16], xmm1
3683 lea edx, [edx + 32]
3684 sub ecx, 32
3685 jg convertloop
3686 ret
3687 }
3688 }
3689 #endif // HAS_COPYROW_SSE2
3690
3691 // Unaligned; handles any byte count (multiple of 1).
3692 __declspec(naked) __declspec(align(16))
3693 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
3694 __asm {
3695 mov eax, esi
3696 mov edx, edi
3697 mov esi, [esp + 4] // src
3698 mov edi, [esp + 8] // dst
3699 mov ecx, [esp + 12] // count
3700 rep movsb
3701 mov edi, edx
3702 mov esi, eax
3703 ret
3704 }
3705 }
3706
3707 #ifdef HAS_COPYROW_X86
3708 __declspec(naked) __declspec(align(16))
3709 void CopyRow_X86(const uint8* src, uint8* dst, int count) {
3710 __asm {
3711 mov eax, esi
3712 mov edx, edi
3713 mov esi, [esp + 4] // src
3714 mov edi, [esp + 8] // dst
3715 mov ecx, [esp + 12] // count
3716 shr ecx, 2
3717 rep movsd
3718 mov edi, edx
3719 mov esi, eax
3720 ret
3721 }
3722 }
3723 #endif // HAS_COPYROW_X86
3724
3725 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3726 // width in pixels
3727 __declspec(naked) __declspec(align(16))
3728 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3729 __asm {
3730 mov eax, [esp + 4] // src
3731 mov edx, [esp + 8] // dst
3732 mov ecx, [esp + 12] // count
3733 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3734 pslld xmm0, 24
3735 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3736 psrld xmm1, 8
3737
3738 align 4
3739 convertloop:
3740 movdqa xmm2, [eax]
3741 movdqa xmm3, [eax + 16]
3742 lea eax, [eax + 32]
3743 movdqa xmm4, [edx]
3744 movdqa xmm5, [edx + 16]
3745 pand xmm2, xmm0
3746 pand xmm3, xmm0
3747 pand xmm4, xmm1
3748 pand xmm5, xmm1
3749 por xmm2, xmm4
3750 por xmm3, xmm5
3751 movdqa [edx], xmm2
3752 movdqa [edx + 16], xmm3
3753 lea edx, [edx + 32]
3754 sub ecx, 8
3755 jg convertloop
3756
3757 ret
3758 }
3759 }
3760 #endif // HAS_ARGBCOPYALPHAROW_SSE2
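
// ARGBCopyAlphaRow copies only the alpha channel from src to dst, leaving
// B, G and R in dst untouched; the SSE2 code above expresses this with the
// 0xff000000 / 0x00ffffff mask pair. Scalar sketch, guarded out of the build
// (name illustrative); width is in pixels.
#if 0
static void ARGBCopyAlphaRow_ScalarSketch(const uint8* src, uint8* dst,
                                          int width) {
  for (int x = 0; x < width; ++x) {
    dst[4 * x + 3] = src[4 * x + 3];  // byte 3 of each ARGB pixel is alpha
  }
}
#endif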
3761
3762 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3763 // width in pixels
3764 __declspec(naked) __declspec(align(16))
3765 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3766 __asm {
3767 mov eax, [esp + 4] // src
3768 mov edx, [esp + 8] // dst
3769 mov ecx, [esp + 12] // count
3770 vpcmpeqb ymm0, ymm0, ymm0
3771 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3772
3773 align 4
3774 convertloop:
3775 vmovdqu ymm1, [eax]
3776 vmovdqu ymm2, [eax + 32]
3777 lea eax, [eax + 64]
3778 vpblendvb ymm1, ymm1, [edx], ymm0
3779 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3780 vmovdqu [edx], ymm1
3781 vmovdqu [edx + 32], ymm2
3782 lea edx, [edx + 64]
3783 sub ecx, 16
3784 jg convertloop
3785
3786 vzeroupper
3787 ret
3788 }
3789 }
3790 #endif // HAS_ARGBCOPYALPHAROW_AVX2
3791
3792 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3793 // width in pixels
3794 __declspec(naked) __declspec(align(16))
3795 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3796 __asm {
3797 mov eax, [esp + 4] // src
3798 mov edx, [esp + 8] // dst
3799 mov ecx, [esp + 12] // count
3800 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3801 pslld xmm0, 24
3802 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3803 psrld xmm1, 8
3804
3805 align 4
3806 convertloop:
3807 movq xmm2, qword ptr [eax] // 8 Y's
3808 lea eax, [eax + 8]
3809 punpcklbw xmm2, xmm2
3810 punpckhwd xmm3, xmm2
3811 punpcklwd xmm2, xmm2
3812 movdqa xmm4, [edx]
3813 movdqa xmm5, [edx + 16]
3814 pand xmm2, xmm0
3815 pand xmm3, xmm0
3816 pand xmm4, xmm1
3817 pand xmm5, xmm1
3818 por xmm2, xmm4
3819 por xmm3, xmm5
3820 movdqa [edx], xmm2
3821 movdqa [edx + 16], xmm3
3822 lea edx, [edx + 32]
3823 sub ecx, 8
3824 jg convertloop
3825
3826 ret
3827 }
3828 }
3829 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3830
3831 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3832 // width in pixels
3833 __declspec(naked) __declspec(align(16))
3834 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3835 __asm {
3836 mov eax, [esp + 4] // src
3837 mov edx, [esp + 8] // dst
3838 mov ecx, [esp + 12] // count
3839 vpcmpeqb ymm0, ymm0, ymm0
3840 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3841
3842 align 4
3843 convertloop:
3844 vpmovzxbd ymm1, qword ptr [eax]
3845 vpmovzxbd ymm2, qword ptr [eax + 8]
3846 lea eax, [eax + 16]
3847 vpslld ymm1, ymm1, 24
3848 vpslld ymm2, ymm2, 24
3849 vpblendvb ymm1, ymm1, [edx], ymm0
3850 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3851 vmovdqu [edx], ymm1
3852 vmovdqu [edx + 32], ymm2
3853 lea edx, [edx + 64]
3854 sub ecx, 16
3855 jg convertloop
3856
3857 vzeroupper
3858 ret
3859 }
3860 }
3861 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3862
3863 #ifdef HAS_SETROW_X86
3864 // SetRow writes 'count' bytes using a 32 bit value repeated.
3865 __declspec(naked) __declspec(align(16))
3866 void SetRow_X86(uint8* dst, uint32 v32, int count) {
3867 __asm {
3868 mov edx, edi
3869 mov edi, [esp + 4] // dst
3870 mov eax, [esp + 8] // v32
3871 mov ecx, [esp + 12] // count
3872 shr ecx, 2
3873 rep stosd
3874 mov edi, edx
3875 ret
3876 }
3877 }
3878
3879 // ARGBSetRows writes a 32 bit value to 'width' pixels on each of 'height' rows.
3880 __declspec(naked) __declspec(align(16))
3881 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
3882 int dst_stride, int height) {
3883 __asm {
3884 push esi
3885 push edi
3886 push ebp
3887 mov edi, [esp + 12 + 4] // dst
3888 mov eax, [esp + 12 + 8] // v32
3889 mov ebp, [esp + 12 + 12] // width
3890 mov edx, [esp + 12 + 16] // dst_stride
3891 mov esi, [esp + 12 + 20] // height
3892 lea ecx, [ebp * 4]
3893 sub edx, ecx // stride - width * 4
3894
3895 align 4
3896 convertloop:
3897 mov ecx, ebp
3898 rep stosd
3899 add edi, edx
3900 sub esi, 1
3901 jg convertloop
3902
3903 pop ebp
3904 pop edi
3905 pop esi
3906 ret
3907 }
3908 }
3909 #endif // HAS_SETROW_X86
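
// ARGBSetRows_X86 above fills 'width' pixels on each of 'height' rows with
// the same 32 bit value, stepping dst_stride bytes between rows (rep stosd
// does the per-row fill). Scalar sketch, guarded out of the build (name
// illustrative).
#if 0
static void ARGBSetRows_ScalarSketch(uint8* dst, uint32 v32, int width,
                                     int dst_stride, int height) {
  for (int y = 0; y < height; ++y) {
    uint32* d = (uint32*)(dst + y * dst_stride);
    for (int x = 0; x < width; ++x) {
      d[x] = v32;  // one ARGB pixel
    }
  }
}
#endif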
3910
3911 #ifdef HAS_YUY2TOYROW_AVX2
3912 __declspec(naked) __declspec(align(16))
3913 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
3914 uint8* dst_y, int pix) {
3915 __asm {
3916 mov eax, [esp + 4] // src_yuy2
3917 mov edx, [esp + 8] // dst_y
3918 mov ecx, [esp + 12] // pix
3919 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3920 vpsrlw ymm5, ymm5, 8
3921
3922 align 4
3923 convertloop:
3924 vmovdqu ymm0, [eax]
3925 vmovdqu ymm1, [eax + 32]
3926 lea eax, [eax + 64]
3927 vpand ymm0, ymm0, ymm5 // even bytes are Y
3928 vpand ymm1, ymm1, ymm5
3929 vpackuswb ymm0, ymm0, ymm1 // mutates.
3930 vpermq ymm0, ymm0, 0xd8
3931 sub ecx, 32
3932 vmovdqu [edx], ymm0
3933 lea edx, [edx + 32]
3934 jg convertloop
3935 vzeroupper
3936 ret
3937 }
3938 }
3939
3940 __declspec(naked) __declspec(align(16))
3941 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3942 uint8* dst_u, uint8* dst_v, int pix) {
3943 __asm {
3944 push esi
3945 push edi
3946 mov eax, [esp + 8 + 4] // src_yuy2
3947 mov esi, [esp + 8 + 8] // stride_yuy2
3948 mov edx, [esp + 8 + 12] // dst_u
3949 mov edi, [esp + 8 + 16] // dst_v
3950 mov ecx, [esp + 8 + 20] // pix
3951 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3952 vpsrlw ymm5, ymm5, 8
3953 sub edi, edx
3954
3955 align 4
3956 convertloop:
3957 vmovdqu ymm0, [eax]
3958 vmovdqu ymm1, [eax + 32]
3959 vpavgb ymm0, ymm0, [eax + esi]
3960 vpavgb ymm1, ymm1, [eax + esi + 32]
3961 lea eax, [eax + 64]
3962 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3963 vpsrlw ymm1, ymm1, 8
3964 vpackuswb ymm0, ymm0, ymm1 // mutates.
3965 vpermq ymm0, ymm0, 0xd8
3966 vpand ymm1, ymm0, ymm5 // U
3967 vpsrlw ymm0, ymm0, 8 // V
3968 vpackuswb ymm1, ymm1, ymm1 // mutates.
3969 vpackuswb ymm0, ymm0, ymm0 // mutates.
3970 vpermq ymm1, ymm1, 0xd8
3971 vpermq ymm0, ymm0, 0xd8
3972 vextractf128 [edx], ymm1, 0 // U
3973 vextractf128 [edx + edi], ymm0, 0 // V
3974 lea edx, [edx + 16]
3975 sub ecx, 32
3976 jg convertloop
3977
3978 pop edi
3979 pop esi
3980 vzeroupper
3981 ret
3982 }
3983 }
3984
3985 __declspec(naked) __declspec(align(16))
3986 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3987 uint8* dst_u, uint8* dst_v, int pix) {
3988 __asm {
3989 push edi
3990 mov eax, [esp + 4 + 4] // src_yuy2
3991 mov edx, [esp + 4 + 8] // dst_u
3992 mov edi, [esp + 4 + 12] // dst_v
3993 mov ecx, [esp + 4 + 16] // pix
3994 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3995 vpsrlw ymm5, ymm5, 8
3996 sub edi, edx
3997
3998 align 4
3999 convertloop:
4000 vmovdqu ymm0, [eax]
4001 vmovdqu ymm1, [eax + 32]
4002 lea eax, [eax + 64]
4003 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
4004 vpsrlw ymm1, ymm1, 8
4005 vpackuswb ymm0, ymm0, ymm1 // mutates.
4006 vpermq ymm0, ymm0, 0xd8
4007 vpand ymm1, ymm0, ymm5 // U
4008 vpsrlw ymm0, ymm0, 8 // V
4009 vpackuswb ymm1, ymm1, ymm1 // mutates.
4010 vpackuswb ymm0, ymm0, ymm0 // mutates.
4011 vpermq ymm1, ymm1, 0xd8
4012 vpermq ymm0, ymm0, 0xd8
4013 vextractf128 [edx], ymm1, 0 // U
4014 vextractf128 [edx + edi], ymm0, 0 // V
4015 lea edx, [edx + 16]
4016 sub ecx, 32
4017 jg convertloop
4018
4019 pop edi
4020 vzeroupper
4021 ret
4022 }
4023 }
4024
4025 __declspec(naked) __declspec(align(16))
4026 void UYVYToYRow_AVX2(const uint8* src_uyvy,
4027 uint8* dst_y, int pix) {
4028 __asm {
4029 mov eax, [esp + 4] // src_uyvy
4030 mov edx, [esp + 8] // dst_y
4031 mov ecx, [esp + 12] // pix
4032
4033 align 4
4034 convertloop:
4035 vmovdqu ymm0, [eax]
4036 vmovdqu ymm1, [eax + 32]
4037 lea eax, [eax + 64]
4038 vpsrlw ymm0, ymm0, 8 // odd bytes are Y
4039 vpsrlw ymm1, ymm1, 8
4040 vpackuswb ymm0, ymm0, ymm1 // mutates.
4041 vpermq ymm0, ymm0, 0xd8
4042 sub ecx, 32
4043 vmovdqu [edx], ymm0
4044 lea edx, [edx + 32]
4045 jg convertloop
4046 vzeroupper
4047 ret
4048 }
4049 }
4050
4051 __declspec(naked) __declspec(align(16))
4052 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
4053 uint8* dst_u, uint8* dst_v, int pix) {
4054 __asm {
4055 push esi
4056 push edi
4057 mov eax, [esp + 8 + 4] // src_uyvy
4058 mov esi, [esp + 8 + 8] // stride_uyvy
4059 mov edx, [esp + 8 + 12] // dst_u
4060 mov edi, [esp + 8 + 16] // dst_v
4061 mov ecx, [esp + 8 + 20] // pix
4062 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
4063 vpsrlw ymm5, ymm5, 8
4064 sub edi, edx
4065
4066 align 4
4067 convertloop:
4068 vmovdqu ymm0, [eax]
4069 vmovdqu ymm1, [eax + 32]
4070 vpavgb ymm0, ymm0, [eax + esi]
4071 vpavgb ymm1, ymm1, [eax + esi + 32]
4072 lea eax, [eax + 64]
4073 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
4074 vpand ymm1, ymm1, ymm5
4075 vpackuswb ymm0, ymm0, ymm1 // mutates.
4076 vpermq ymm0, ymm0, 0xd8
4077 vpand ymm1, ymm0, ymm5 // U
4078 vpsrlw ymm0, ymm0, 8 // V
4079 vpackuswb ymm1, ymm1, ymm1 // mutates.
4080 vpackuswb ymm0, ymm0, ymm0 // mutates.
4081 vpermq ymm1, ymm1, 0xd8
4082 vpermq ymm0, ymm0, 0xd8
4083 vextractf128 [edx], ymm1, 0 // U
4084 vextractf128 [edx + edi], ymm0, 0 // V
4085 lea edx, [edx + 16]
4086 sub ecx, 32
4087 jg convertloop
4088
4089 pop edi
4090 pop esi
4091 vzeroupper
4092 ret
4093 }
4094 }
4095
4096 __declspec(naked) __declspec(align(16))
4097 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
4098 uint8* dst_u, uint8* dst_v, int pix) {
4099 __asm {
4100 push edi
4101 mov eax, [esp + 4 + 4] // src_uyvy
4102 mov edx, [esp + 4 + 8] // dst_u
4103 mov edi, [esp + 4 + 12] // dst_v
4104 mov ecx, [esp + 4 + 16] // pix
4105 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
4106 vpsrlw ymm5, ymm5, 8
4107 sub edi, edx
4108
4109 align 4
4110 convertloop:
4111 vmovdqu ymm0, [eax]
4112 vmovdqu ymm1, [eax + 32]
4113 lea eax, [eax + 64]
4114 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
4115 vpand ymm1, ymm1, ymm5
4116 vpackuswb ymm0, ymm0, ymm1 // mutates.
4117 vpermq ymm0, ymm0, 0xd8
4118 vpand ymm1, ymm0, ymm5 // U
4119 vpsrlw ymm0, ymm0, 8 // V
4120 vpackuswb ymm1, ymm1, ymm1 // mutates.
4121 vpackuswb ymm0, ymm0, ymm0 // mutates.
4122 vpermq ymm1, ymm1, 0xd8
4123 vpermq ymm0, ymm0, 0xd8
4124 vextractf128 [edx], ymm1, 0 // U
4125 vextractf128 [edx + edi], ymm0, 0 // V
4126 lea edx, [edx + 16]
4127 sub ecx, 32
4128 jg convertloop
4129
4130 pop edi
4131 vzeroupper
4132 ret
4133 }
4134 }
4135 #endif // HAS_YUY2TOYROW_AVX2
4136
4137 #ifdef HAS_YUY2TOYROW_SSE2
4138 __declspec(naked) __declspec(align(16))
4139 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
4140 uint8* dst_y, int pix) {
4141 __asm {
4142 mov eax, [esp + 4] // src_yuy2
4143 mov edx, [esp + 8] // dst_y
4144 mov ecx, [esp + 12] // pix
4145 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4146 psrlw xmm5, 8
4147
4148 align 4
4149 convertloop:
4150 movdqa xmm0, [eax]
4151 movdqa xmm1, [eax + 16]
4152 lea eax, [eax + 32]
4153 pand xmm0, xmm5 // even bytes are Y
4154 pand xmm1, xmm5
4155 packuswb xmm0, xmm1
4156 sub ecx, 16
4157 movdqa [edx], xmm0
4158 lea edx, [edx + 16]
4159 jg convertloop
4160 ret
4161 }
4162 }
4163
4164 __declspec(naked) __declspec(align(16))
4165 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
4166 uint8* dst_u, uint8* dst_v, int pix) {
4167 __asm {
4168 push esi
4169 push edi
4170 mov eax, [esp + 8 + 4] // src_yuy2
4171 mov esi, [esp + 8 + 8] // stride_yuy2
4172 mov edx, [esp + 8 + 12] // dst_u
4173 mov edi, [esp + 8 + 16] // dst_v
4174 mov ecx, [esp + 8 + 20] // pix
4175 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4176 psrlw xmm5, 8
4177 sub edi, edx
4178
4179 align 4
4180 convertloop:
4181 movdqa xmm0, [eax]
4182 movdqa xmm1, [eax + 16]
4183 movdqa xmm2, [eax + esi]
4184 movdqa xmm3, [eax + esi + 16]
4185 lea eax, [eax + 32]
4186 pavgb xmm0, xmm2
4187 pavgb xmm1, xmm3
4188 psrlw xmm0, 8 // YUYV -> UVUV
4189 psrlw xmm1, 8
4190 packuswb xmm0, xmm1
4191 movdqa xmm1, xmm0
4192 pand xmm0, xmm5 // U
4193 packuswb xmm0, xmm0
4194 psrlw xmm1, 8 // V
4195 packuswb xmm1, xmm1
4196 movq qword ptr [edx], xmm0
4197 movq qword ptr [edx + edi], xmm1
4198 lea edx, [edx + 8]
4199 sub ecx, 16
4200 jg convertloop
4201
4202 pop edi
4203 pop esi
4204 ret
4205 }
4206 }
4207
4208 __declspec(naked) __declspec(align(16))
4209 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
4210 uint8* dst_u, uint8* dst_v, int pix) {
4211 __asm {
4212 push edi
4213 mov eax, [esp + 4 + 4] // src_yuy2
4214 mov edx, [esp + 4 + 8] // dst_u
4215 mov edi, [esp + 4 + 12] // dst_v
4216 mov ecx, [esp + 4 + 16] // pix
4217 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4218 psrlw xmm5, 8
4219 sub edi, edx
4220
4221 align 4
4222 convertloop:
4223 movdqa xmm0, [eax]
4224 movdqa xmm1, [eax + 16]
4225 lea eax, [eax + 32]
4226 psrlw xmm0, 8 // YUYV -> UVUV
4227 psrlw xmm1, 8
4228 packuswb xmm0, xmm1
4229 movdqa xmm1, xmm0
4230 pand xmm0, xmm5 // U
4231 packuswb xmm0, xmm0
4232 psrlw xmm1, 8 // V
4233 packuswb xmm1, xmm1
4234 movq qword ptr [edx], xmm0
4235 movq qword ptr [edx + edi], xmm1
4236 lea edx, [edx + 8]
4237 sub ecx, 16
4238 jg convertloop
4239
4240 pop edi
4241 ret
4242 }
4243 }
4244
4245 __declspec(naked) __declspec(align(16))
4246 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
4247 uint8* dst_y, int pix) {
4248 __asm {
4249 mov eax, [esp + 4] // src_yuy2
4250 mov edx, [esp + 8] // dst_y
4251 mov ecx, [esp + 12] // pix
4252 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4253 psrlw xmm5, 8
4254
4255 align 4
4256 convertloop:
4257 movdqu xmm0, [eax]
4258 movdqu xmm1, [eax + 16]
4259 lea eax, [eax + 32]
4260 pand xmm0, xmm5 // even bytes are Y
4261 pand xmm1, xmm5
4262 packuswb xmm0, xmm1
4263 sub ecx, 16
4264 movdqu [edx], xmm0
4265 lea edx, [edx + 16]
4266 jg convertloop
4267 ret
4268 }
4269 }
4270
4271 __declspec(naked) __declspec(align(16))
4272 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
4273 uint8* dst_u, uint8* dst_v, int pix) {
4274 __asm {
4275 push esi
4276 push edi
4277 mov eax, [esp + 8 + 4] // src_yuy2
4278 mov esi, [esp + 8 + 8] // stride_yuy2
4279 mov edx, [esp + 8 + 12] // dst_u
4280 mov edi, [esp + 8 + 16] // dst_v
4281 mov ecx, [esp + 8 + 20] // pix
4282 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4283 psrlw xmm5, 8
4284 sub edi, edx
4285
4286 align 4
4287 convertloop:
4288 movdqu xmm0, [eax]
4289 movdqu xmm1, [eax + 16]
4290 movdqu xmm2, [eax + esi]
4291 movdqu xmm3, [eax + esi + 16]
4292 lea eax, [eax + 32]
4293 pavgb xmm0, xmm2
4294 pavgb xmm1, xmm3
4295 psrlw xmm0, 8 // YUYV -> UVUV
4296 psrlw xmm1, 8
4297 packuswb xmm0, xmm1
4298 movdqa xmm1, xmm0
4299 pand xmm0, xmm5 // U
4300 packuswb xmm0, xmm0
4301 psrlw xmm1, 8 // V
4302 packuswb xmm1, xmm1
4303 movq qword ptr [edx], xmm0
4304 movq qword ptr [edx + edi], xmm1
4305 lea edx, [edx + 8]
4306 sub ecx, 16
4307 jg convertloop
4308
4309 pop edi
4310 pop esi
4311 ret
4312 }
4313 }
4314
4315 __declspec(naked) __declspec(align(16))
4316 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
4317 uint8* dst_u, uint8* dst_v, int pix) {
4318 __asm {
4319 push edi
4320 mov eax, [esp + 4 + 4] // src_yuy2
4321 mov edx, [esp + 4 + 8] // dst_u
4322 mov edi, [esp + 4 + 12] // dst_v
4323 mov ecx, [esp + 4 + 16] // pix
4324 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4325 psrlw xmm5, 8
4326 sub edi, edx
4327
4328 align 4
4329 convertloop:
4330 movdqu xmm0, [eax]
4331 movdqu xmm1, [eax + 16]
4332 lea eax, [eax + 32]
4333 psrlw xmm0, 8 // YUYV -> UVUV
4334 psrlw xmm1, 8
4335 packuswb xmm0, xmm1
4336 movdqa xmm1, xmm0
4337 pand xmm0, xmm5 // U
4338 packuswb xmm0, xmm0
4339 psrlw xmm1, 8 // V
4340 packuswb xmm1, xmm1
4341 movq qword ptr [edx], xmm0
4342 movq qword ptr [edx + edi], xmm1
4343 lea edx, [edx + 8]
4344 sub ecx, 16
4345 jg convertloop
4346
4347 pop edi
4348 ret
4349 }
4350 }
4351
4352 __declspec(naked) __declspec(align(16))
4353 void UYVYToYRow_SSE2(const uint8* src_uyvy,
4354 uint8* dst_y, int pix) {
4355 __asm {
4356 mov eax, [esp + 4] // src_uyvy
4357 mov edx, [esp + 8] // dst_y
4358 mov ecx, [esp + 12] // pix
4359
4360 align 4
4361 convertloop:
4362 movdqa xmm0, [eax]
4363 movdqa xmm1, [eax + 16]
4364 lea eax, [eax + 32]
4365 psrlw xmm0, 8 // odd bytes are Y
4366 psrlw xmm1, 8
4367 packuswb xmm0, xmm1
4368 sub ecx, 16
4369 movdqa [edx], xmm0
4370 lea edx, [edx + 16]
4371 jg convertloop
4372 ret
4373 }
4374 }
4375
4376 __declspec(naked) __declspec(align(16))
4377 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
4378 uint8* dst_u, uint8* dst_v, int pix) {
4379 __asm {
4380 push esi
4381 push edi
4382 mov eax, [esp + 8 + 4] // src_uyvy
4383 mov esi, [esp + 8 + 8] // stride_uyvy
4384 mov edx, [esp + 8 + 12] // dst_u
4385 mov edi, [esp + 8 + 16] // dst_v
4386 mov ecx, [esp + 8 + 20] // pix
4387 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4388 psrlw xmm5, 8
4389 sub edi, edx
4390
4391 align 4
4392 convertloop:
4393 movdqa xmm0, [eax]
4394 movdqa xmm1, [eax + 16]
4395 movdqa xmm2, [eax + esi]
4396 movdqa xmm3, [eax + esi + 16]
4397 lea eax, [eax + 32]
4398 pavgb xmm0, xmm2
4399 pavgb xmm1, xmm3
4400 pand xmm0, xmm5 // UYVY -> UVUV
4401 pand xmm1, xmm5
4402 packuswb xmm0, xmm1
4403 movdqa xmm1, xmm0
4404 pand xmm0, xmm5 // U
4405 packuswb xmm0, xmm0
4406 psrlw xmm1, 8 // V
4407 packuswb xmm1, xmm1
4408 movq qword ptr [edx], xmm0
4409 movq qword ptr [edx + edi], xmm1
4410 lea edx, [edx + 8]
4411 sub ecx, 16
4412 jg convertloop
4413
4414 pop edi
4415 pop esi
4416 ret
4417 }
4418 }
4419
4420 __declspec(naked) __declspec(align(16))
4421 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
4422 uint8* dst_u, uint8* dst_v, int pix) {
4423 __asm {
4424 push edi
4425 mov eax, [esp + 4 + 4] // src_uyvy
4426 mov edx, [esp + 4 + 8] // dst_u
4427 mov edi, [esp + 4 + 12] // dst_v
4428 mov ecx, [esp + 4 + 16] // pix
4429 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4430 psrlw xmm5, 8
4431 sub edi, edx
4432
4433 align 4
4434 convertloop:
4435 movdqa xmm0, [eax]
4436 movdqa xmm1, [eax + 16]
4437 lea eax, [eax + 32]
4438 pand xmm0, xmm5 // UYVY -> UVUV
4439 pand xmm1, xmm5
4440 packuswb xmm0, xmm1
4441 movdqa xmm1, xmm0
4442 pand xmm0, xmm5 // U
4443 packuswb xmm0, xmm0
4444 psrlw xmm1, 8 // V
4445 packuswb xmm1, xmm1
4446 movq qword ptr [edx], xmm0
4447 movq qword ptr [edx + edi], xmm1
4448 lea edx, [edx + 8]
4449 sub ecx, 16
4450 jg convertloop
4451
4452 pop edi
4453 ret
4454 }
4455 }
4456
4457 __declspec(naked) __declspec(align(16))
4458 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
4459 uint8* dst_y, int pix) {
4460 __asm {
4461 mov eax, [esp + 4] // src_uyvy
4462 mov edx, [esp + 8] // dst_y
4463 mov ecx, [esp + 12] // pix
4464
4465 align 4
4466 convertloop:
4467 movdqu xmm0, [eax]
4468 movdqu xmm1, [eax + 16]
4469 lea eax, [eax + 32]
4470 psrlw xmm0, 8 // odd bytes are Y
4471 psrlw xmm1, 8
4472 packuswb xmm0, xmm1
4473 sub ecx, 16
4474 movdqu [edx], xmm0
4475 lea edx, [edx + 16]
4476 jg convertloop
4477 ret
4478 }
4479 }
4480
4481 __declspec(naked) __declspec(align(16))
4482 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
4483 uint8* dst_u, uint8* dst_v, int pix) {
4484 __asm {
4485 push esi
4486 push edi
4487 mov eax, [esp + 8 + 4] // src_uyvy
4488 mov esi, [esp + 8 + 8] // stride_uyvy
4489 mov edx, [esp + 8 + 12] // dst_u
4490 mov edi, [esp + 8 + 16] // dst_v
4491 mov ecx, [esp + 8 + 20] // pix
4492 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4493 psrlw xmm5, 8
4494 sub edi, edx
4495
4496 align 4
4497 convertloop:
4498 movdqu xmm0, [eax]
4499 movdqu xmm1, [eax + 16]
4500 movdqu xmm2, [eax + esi]
4501 movdqu xmm3, [eax + esi + 16]
4502 lea eax, [eax + 32]
4503 pavgb xmm0, xmm2
4504 pavgb xmm1, xmm3
4505 pand xmm0, xmm5 // UYVY -> UVUV
4506 pand xmm1, xmm5
4507 packuswb xmm0, xmm1
4508 movdqa xmm1, xmm0
4509 pand xmm0, xmm5 // U
4510 packuswb xmm0, xmm0
4511 psrlw xmm1, 8 // V
4512 packuswb xmm1, xmm1
4513 movq qword ptr [edx], xmm0
4514 movq qword ptr [edx + edi], xmm1
4515 lea edx, [edx + 8]
4516 sub ecx, 16
4517 jg convertloop
4518
4519 pop edi
4520 pop esi
4521 ret
4522 }
4523 }
4524
4525 __declspec(naked) __declspec(align(16))
4526 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
4527 uint8* dst_u, uint8* dst_v, int pix) {
4528 __asm {
4529 push edi
4530 mov eax, [esp + 4 + 4] // src_uyvy
4531 mov edx, [esp + 4 + 8] // dst_u
4532 mov edi, [esp + 4 + 12] // dst_v
4533 mov ecx, [esp + 4 + 16] // pix
4534 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4535 psrlw xmm5, 8
4536 sub edi, edx
4537
4538 align 4
4539 convertloop:
4540 movdqu xmm0, [eax]
4541 movdqu xmm1, [eax + 16]
4542 lea eax, [eax + 32]
4543 pand xmm0, xmm5 // UYVY -> UVUV
4544 pand xmm1, xmm5
4545 packuswb xmm0, xmm1
4546 movdqa xmm1, xmm0
4547 pand xmm0, xmm5 // U
4548 packuswb xmm0, xmm0
4549 psrlw xmm1, 8 // V
4550 packuswb xmm1, xmm1
4551 movq qword ptr [edx], xmm0
4552 movq qword ptr [edx + edi], xmm1
4553 lea edx, [edx + 8]
4554 sub ecx, 16
4555 jg convertloop
4556
4557 pop edi
4558 ret
4559 }
4560 }
4561 #endif // HAS_YUY2TOYROW_SSE2
4562
4563 #ifdef HAS_ARGBBLENDROW_SSE2
4564 // Blend 8 pixels at a time.
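// A scalar model of the blend computed by ARGBBlendRow_SSE2 below
// (hypothetical helper for illustration): "src over dst" using the 8 bit
// approximation dst * (256 - src_alpha) >> 8, with the result alpha forced to
// 255 and each channel saturated, matching the paddusb in the assembly.
static uint32 BlendPixel_Sketch(uint32 src, uint32 dst) {
  uint32 a = src >> 24;
  uint32 b = ((src >> 0) & 0xff) + ((((dst >> 0) & 0xff) * (256 - a)) >> 8);
  uint32 g = ((src >> 8) & 0xff) + ((((dst >> 8) & 0xff) * (256 - a)) >> 8);
  uint32 r = ((src >> 16) & 0xff) + ((((dst >> 16) & 0xff) * (256 - a)) >> 8);
  if (b > 255) b = 255;
  if (g > 255) g = 255;
  if (r > 255) r = 255;
  return 0xff000000u | (r << 16) | (g << 8) | b;
}
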
4565 __declspec(naked) __declspec(align(16))
4566 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4567 uint8* dst_argb, int width) {
4568 __asm {
4569 push esi
4570 mov eax, [esp + 4 + 4] // src_argb0
4571 mov esi, [esp + 4 + 8] // src_argb1
4572 mov edx, [esp + 4 + 12] // dst_argb
4573 mov ecx, [esp + 4 + 16] // width
4574 pcmpeqb xmm7, xmm7 // generate constant 1
4575 psrlw xmm7, 15
4576 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4577 psrlw xmm6, 8
4578 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4579 psllw xmm5, 8
4580 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4581 pslld xmm4, 24
4582
4583 sub ecx, 1
4584 je convertloop1 // only 1 pixel?
4585 jl convertloop1b
4586
4587 // 1 pixel loop until destination pointer is aligned.
4588 alignloop1:
4589 test edx, 15 // aligned?
4590 je alignloop1b
4591 movd xmm3, [eax]
4592 lea eax, [eax + 4]
4593 movdqa xmm0, xmm3 // src argb
4594 pxor xmm3, xmm4 // ~alpha
4595 movd xmm2, [esi] // _r_b
4596 psrlw xmm3, 8 // alpha
4597 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4598 pshuflw xmm3, xmm3, 0F5h
4599 pand xmm2, xmm6 // _r_b
4600 paddw xmm3, xmm7 // 256 - alpha
4601 pmullw xmm2, xmm3 // _r_b * alpha
4602 movd xmm1, [esi] // _a_g
4603 lea esi, [esi + 4]
4604 psrlw xmm1, 8 // _a_g
4605 por xmm0, xmm4 // set alpha to 255
4606 pmullw xmm1, xmm3 // _a_g * alpha
4607 psrlw xmm2, 8 // _r_b convert to 8 bits again
4608 paddusb xmm0, xmm2 // + src argb
4609 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4610 paddusb xmm0, xmm1 // + src argb
4611 sub ecx, 1
4612 movd [edx], xmm0
4613 lea edx, [edx + 4]
4614 jge alignloop1
4615
4616 alignloop1b:
4617 add ecx, 1 - 4
4618 jl convertloop4b
4619
4620 // 4 pixel loop.
4621 convertloop4:
4622 movdqu xmm3, [eax] // src argb
4623 lea eax, [eax + 16]
4624 movdqa xmm0, xmm3 // src argb
4625 pxor xmm3, xmm4 // ~alpha
4626 movdqu xmm2, [esi] // _r_b
4627 psrlw xmm3, 8 // alpha
4628 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4629 pshuflw xmm3, xmm3, 0F5h
4630 pand xmm2, xmm6 // _r_b
4631 paddw xmm3, xmm7 // 256 - alpha
4632 pmullw xmm2, xmm3 // _r_b * alpha
4633 movdqu xmm1, [esi] // _a_g
4634 lea esi, [esi + 16]
4635 psrlw xmm1, 8 // _a_g
4636 por xmm0, xmm4 // set alpha to 255
4637 pmullw xmm1, xmm3 // _a_g * alpha
4638 psrlw xmm2, 8 // _r_b convert to 8 bits again
4639 paddusb xmm0, xmm2 // + src argb
4640 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4641 paddusb xmm0, xmm1 // + src argb
4642 sub ecx, 4
4643 movdqa [edx], xmm0
4644 lea edx, [edx + 16]
4645 jge convertloop4
4646
4647 convertloop4b:
4648 add ecx, 4 - 1
4649 jl convertloop1b
4650
4651 // 1 pixel loop.
4652 convertloop1:
4653 movd xmm3, [eax] // src argb
4654 lea eax, [eax + 4]
4655 movdqa xmm0, xmm3 // src argb
4656 pxor xmm3, xmm4 // ~alpha
4657 movd xmm2, [esi] // _r_b
4658 psrlw xmm3, 8 // alpha
4659 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4660 pshuflw xmm3, xmm3, 0F5h
4661 pand xmm2, xmm6 // _r_b
4662 paddw xmm3, xmm7 // 256 - alpha
4663 pmullw xmm2, xmm3 // _r_b * alpha
4664 movd xmm1, [esi] // _a_g
4665 lea esi, [esi + 4]
4666 psrlw xmm1, 8 // _a_g
4667 por xmm0, xmm4 // set alpha to 255
4668 pmullw xmm1, xmm3 // _a_g * alpha
4669 psrlw xmm2, 8 // _r_b convert to 8 bits again
4670 paddusb xmm0, xmm2 // + src argb
4671 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4672 paddusb xmm0, xmm1 // + src argb
4673 sub ecx, 1
4674 movd [edx], xmm0
4675 lea edx, [edx + 4]
4676 jge convertloop1
4677
4678 convertloop1b:
4679 pop esi
4680 ret
4681 }
4682 }
4683 #endif // HAS_ARGBBLENDROW_SSE2
4684
4685 #ifdef HAS_ARGBBLENDROW_SSSE3
4686 // Shuffle table for isolating alpha.
4687 static const uvec8 kShuffleAlpha = {
4688 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4689 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
4690 };
4691 // Same as SSE2, but replaces:
4692 // psrlw xmm3, 8 // alpha
4693 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4694 // pshuflw xmm3, xmm3, 0F5h
4695 // with..
4696 // pshufb xmm3, kShuffleAlpha // alpha
4697 // Blend 8 pixels at a time.
4698
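// Plain-C model (hypothetical helper) of what "pshufb xmm3, kShuffleAlpha"
// produces from 4 ARGB pixels: each pixel's alpha byte is copied into two
// adjacent 16 bit words (the 0x80 table entries clear the other bytes), the
// same 8 alpha words the SSE2 psrlw/pshufhw/pshuflw sequence builds.
static void ShuffleAlphaModel_Sketch(const uint8* argb4, uint16 alpha_words[8]) {
  int i;
  for (i = 0; i < 4; ++i) {
    alpha_words[i * 2 + 0] = argb4[i * 4 + 3];
    alpha_words[i * 2 + 1] = argb4[i * 4 + 3];
  }
}
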
4699 __declspec(naked) __declspec(align(16))
4700 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
4701 uint8* dst_argb, int width) {
4702 __asm {
4703 push esi
4704 mov eax, [esp + 4 + 4] // src_argb0
4705 mov esi, [esp + 4 + 8] // src_argb1
4706 mov edx, [esp + 4 + 12] // dst_argb
4707 mov ecx, [esp + 4 + 16] // width
4708 pcmpeqb xmm7, xmm7 // generate constant 0x0001
4709 psrlw xmm7, 15
4710 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4711 psrlw xmm6, 8
4712 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4713 psllw xmm5, 8
4714 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4715 pslld xmm4, 24
4716
4717 sub ecx, 1
4718 je convertloop1 // only 1 pixel?
4719 jl convertloop1b
4720
4721 // 1 pixel loop until destination pointer is aligned.
4722 alignloop1:
4723 test edx, 15 // aligned?
4724 je alignloop1b
4725 movd xmm3, [eax]
4726 lea eax, [eax + 4]
4727 movdqa xmm0, xmm3 // src argb
4728 pxor xmm3, xmm4 // ~alpha
4729 movd xmm2, [esi] // _r_b
4730 pshufb xmm3, kShuffleAlpha // alpha
4731 pand xmm2, xmm6 // _r_b
4732 paddw xmm3, xmm7 // 256 - alpha
4733 pmullw xmm2, xmm3 // _r_b * alpha
4734 movd xmm1, [esi] // _a_g
4735 lea esi, [esi + 4]
4736 psrlw xmm1, 8 // _a_g
4737 por xmm0, xmm4 // set alpha to 255
4738 pmullw xmm1, xmm3 // _a_g * alpha
4739 psrlw xmm2, 8 // _r_b convert to 8 bits again
4740 paddusb xmm0, xmm2 // + src argb
4741 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4742 paddusb xmm0, xmm1 // + src argb
4743 sub ecx, 1
4744 movd [edx], xmm0
4745 lea edx, [edx + 4]
4746 jge alignloop1
4747
4748 alignloop1b:
4749 add ecx, 1 - 4
4750 jl convertloop4b
4751
4752 test eax, 15 // unaligned?
4753 jne convertuloop4
4754 test esi, 15 // unaligned?
4755 jne convertuloop4
4756
4757 // 4 pixel loop.
4758 convertloop4:
4759 movdqa xmm3, [eax] // src argb
4760 lea eax, [eax + 16]
4761 movdqa xmm0, xmm3 // src argb
4762 pxor xmm3, xmm4 // ~alpha
4763 movdqa xmm2, [esi] // _r_b
4764 pshufb xmm3, kShuffleAlpha // alpha
4765 pand xmm2, xmm6 // _r_b
4766 paddw xmm3, xmm7 // 256 - alpha
4767 pmullw xmm2, xmm3 // _r_b * alpha
4768 movdqa xmm1, [esi] // _a_g
4769 lea esi, [esi + 16]
4770 psrlw xmm1, 8 // _a_g
4771 por xmm0, xmm4 // set alpha to 255
4772 pmullw xmm1, xmm3 // _a_g * alpha
4773 psrlw xmm2, 8 // _r_b convert to 8 bits again
4774 paddusb xmm0, xmm2 // + src argb
4775 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4776 paddusb xmm0, xmm1 // + src argb
4777 sub ecx, 4
4778 movdqa [edx], xmm0
4779 lea edx, [edx + 16]
4780 jge convertloop4
4781 jmp convertloop4b
4782
4783 // 4 pixel unaligned loop.
4784 convertuloop4:
4785 movdqu xmm3, [eax] // src argb
4786 lea eax, [eax + 16]
4787 movdqa xmm0, xmm3 // src argb
4788 pxor xmm3, xmm4 // ~alpha
4789 movdqu xmm2, [esi] // _r_b
4790 pshufb xmm3, kShuffleAlpha // alpha
4791 pand xmm2, xmm6 // _r_b
4792 paddw xmm3, xmm7 // 256 - alpha
4793 pmullw xmm2, xmm3 // _r_b * alpha
4794 movdqu xmm1, [esi] // _a_g
4795 lea esi, [esi + 16]
4796 psrlw xmm1, 8 // _a_g
4797 por xmm0, xmm4 // set alpha to 255
4798 pmullw xmm1, xmm3 // _a_g * alpha
4799 psrlw xmm2, 8 // _r_b convert to 8 bits again
4800 paddusb xmm0, xmm2 // + src argb
4801 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4802 paddusb xmm0, xmm1 // + src argb
4803 sub ecx, 4
4804 movdqa [edx], xmm0
4805 lea edx, [edx + 16]
4806 jge convertuloop4
4807
4808 convertloop4b:
4809 add ecx, 4 - 1
4810 jl convertloop1b
4811
4812 // 1 pixel loop.
4813 convertloop1:
4814 movd xmm3, [eax] // src argb
4815 lea eax, [eax + 4]
4816 movdqa xmm0, xmm3 // src argb
4817 pxor xmm3, xmm4 // ~alpha
4818 movd xmm2, [esi] // _r_b
4819 pshufb xmm3, kShuffleAlpha // alpha
4820 pand xmm2, xmm6 // _r_b
4821 paddw xmm3, xmm7 // 256 - alpha
4822 pmullw xmm2, xmm3 // _r_b * alpha
4823 movd xmm1, [esi] // _a_g
4824 lea esi, [esi + 4]
4825 psrlw xmm1, 8 // _a_g
4826 por xmm0, xmm4 // set alpha to 255
4827 pmullw xmm1, xmm3 // _a_g * alpha
4828 psrlw xmm2, 8 // _r_b convert to 8 bits again
4829 paddusb xmm0, xmm2 // + src argb
4830 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4831 paddusb xmm0, xmm1 // + src argb
4832 sub ecx, 1
4833 movd [edx], xmm0
4834 lea edx, [edx + 4]
4835 jge convertloop1
4836
4837 convertloop1b:
4838 pop esi
4839 ret
4840 }
4841 }
4842 #endif // HAS_ARGBBLENDROW_SSSE3
4843
4844 #ifdef HAS_ARGBATTENUATEROW_SSE2
4845 // Attenuate 4 pixels at a time.
4846 // Aligned to 16 bytes.
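// Scalar model of the attenuation math used below (hypothetical helper):
// punpcklbw widens each channel to c * 0x0101, pmulhuw multiplies by the alpha
// widened the same way, and the final shift leaves roughly (c * a) / 255.
static uint8 AttenuateChannel_Sketch(uint8 c, uint8 a) {
  return (uint8)((((uint32)(c) * 0x0101u) * ((uint32)(a) * 0x0101u)) >> 24);
}
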
4847 __declspec(naked) __declspec(align(16))
4848 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
4849 __asm {
4850 mov eax, [esp + 4] // src_argb0
4851 mov edx, [esp + 8] // dst_argb
4852 mov ecx, [esp + 12] // width
4853 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4854 pslld xmm4, 24
4855 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
4856 psrld xmm5, 8
4857
4858 align 4
4859 convertloop:
4860 movdqa xmm0, [eax] // read 4 pixels
4861 punpcklbw xmm0, xmm0 // first 2
4862 pshufhw xmm2, xmm0, 0FFh // 8 alpha words
4863 pshuflw xmm2, xmm2, 0FFh
4864 pmulhuw xmm0, xmm2 // rgb * a
4865 movdqa xmm1, [eax] // read 4 pixels
4866 punpckhbw xmm1, xmm1 // next 2 pixels
4867 pshufhw xmm2, xmm1, 0FFh // 8 alpha words
4868 pshuflw xmm2, xmm2, 0FFh
4869 pmulhuw xmm1, xmm2 // rgb * a
4870 movdqa xmm2, [eax] // alphas
4871 lea eax, [eax + 16]
4872 psrlw xmm0, 8
4873 pand xmm2, xmm4
4874 psrlw xmm1, 8
4875 packuswb xmm0, xmm1
4876 pand xmm0, xmm5 // keep original alphas
4877 por xmm0, xmm2
4878 sub ecx, 4
4879 movdqa [edx], xmm0
4880 lea edx, [edx + 16]
4881 jg convertloop
4882
4883 ret
4884 }
4885 }
4886 #endif // HAS_ARGBATTENUATEROW_SSE2
4887
4888 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4889 // Shuffle table duplicating alpha.
4890 static const uvec8 kShuffleAlpha0 = {
4891 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4892 };
4893 static const uvec8 kShuffleAlpha1 = {
4894 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4895 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4896 };
4897 __declspec(naked) __declspec(align(16))
4898 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
4899 __asm {
4900 mov eax, [esp + 4] // src_argb0
4901 mov edx, [esp + 8] // dst_argb
4902 mov ecx, [esp + 12] // width
4903 pcmpeqb xmm3, xmm3 // generate mask 0xff000000
4904 pslld xmm3, 24
4905 movdqa xmm4, kShuffleAlpha0
4906 movdqa xmm5, kShuffleAlpha1
4907
4908 align 4
4909 convertloop:
4910 movdqu xmm0, [eax] // read 4 pixels
4911 pshufb xmm0, xmm4 // isolate first 2 alphas
4912 movdqu xmm1, [eax] // read 4 pixels
4913 punpcklbw xmm1, xmm1 // first 2 pixel rgbs
4914 pmulhuw xmm0, xmm1 // rgb * a
4915 movdqu xmm1, [eax] // read 4 pixels
4916 pshufb xmm1, xmm5 // isolate next 2 alphas
4917 movdqu xmm2, [eax] // read 4 pixels
4918 punpckhbw xmm2, xmm2 // next 2 pixel rgbs
4919 pmulhuw xmm1, xmm2 // rgb * a
4920 movdqu xmm2, [eax] // mask original alpha
4921 lea eax, [eax + 16]
4922 pand xmm2, xmm3
4923 psrlw xmm0, 8
4924 psrlw xmm1, 8
4925 packuswb xmm0, xmm1
4926 por xmm0, xmm2 // copy original alpha
4927 sub ecx, 4
4928 movdqu [edx], xmm0
4929 lea edx, [edx + 16]
4930 jg convertloop
4931
4932 ret
4933 }
4934 }
4935 #endif // HAS_ARGBATTENUATEROW_SSSE3
4936
4937 #ifdef HAS_ARGBATTENUATEROW_AVX2
4938 // Shuffle table duplicating alpha.
4939 static const ulvec8 kShuffleAlpha_AVX2 = {
4940 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
4941 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
4942 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
4943 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
4944 };
4945 __declspec(naked) __declspec(align(16))
4946 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
4947 __asm {
4948 mov eax, [esp + 4] // src_argb0
4949 mov edx, [esp + 8] // dst_argb
4950 mov ecx, [esp + 12] // width
4951 sub edx, eax
4952 vmovdqa ymm4, kShuffleAlpha_AVX2
4953 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
4954 vpslld ymm5, ymm5, 24
4955
4956 align 4
4957 convertloop:
4958 vmovdqu ymm6, [eax] // read 8 pixels.
4959 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4960 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4961 vpshufb ymm2, ymm0, ymm4 // low 4 alphas
4962 vpshufb ymm3, ymm1, ymm4 // high 4 alphas
4963 vpmulhuw ymm0, ymm0, ymm2 // rgb * a
4964 vpmulhuw ymm1, ymm1, ymm3 // rgb * a
4965 vpand ymm6, ymm6, ymm5 // isolate alpha
4966 vpsrlw ymm0, ymm0, 8
4967 vpsrlw ymm1, ymm1, 8
4968 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4969 vpor ymm0, ymm0, ymm6 // copy original alpha
4970 sub ecx, 8
4971 vmovdqu [eax + edx], ymm0
4972 lea eax, [eax + 32]
4973 jg convertloop
4974
4975 vzeroupper
4976 ret
4977 }
4978 }
4979 #endif // HAS_ARGBATTENUATEROW_AVX2
4980
4981 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4982 // Unattenuate 4 pixels at a time.
4983 // Aligned to 16 bytes.
4984 __declspec(naked) __declspec(align(16))
4985 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4986 int width) {
4987 __asm {
4988 push esi
4989 push edi
4990 mov eax, [esp + 8 + 4] // src_argb0
4991 mov edx, [esp + 8 + 8] // dst_argb
4992 mov ecx, [esp + 8 + 12] // width
4993
4994 align 4
4995 convertloop:
4996 movdqu xmm0, [eax] // read 4 pixels
4997 movzx esi, byte ptr [eax + 3] // first alpha
4998 movzx edi, byte ptr [eax + 7] // second alpha
4999 punpcklbw xmm0, xmm0 // first 2
5000 movd xmm2, dword ptr fixed_invtbl8[esi * 4]
5001 movd xmm3, dword ptr fixed_invtbl8[edi * 4]
5002 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
5003 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
5004 movlhps xmm2, xmm3
5005 pmulhuw xmm0, xmm2 // rgb * a
5006
5007 movdqu xmm1, [eax] // read 4 pixels
5008 movzx esi, byte ptr [eax + 11] // third alpha
5009 movzx edi, byte ptr [eax + 15] // fourth alpha
5010 punpckhbw xmm1, xmm1 // next 2
5011 movd xmm2, dword ptr fixed_invtbl8[esi * 4]
5012 movd xmm3, dword ptr fixed_invtbl8[edi * 4]
5013 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
5014 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
5015 movlhps xmm2, xmm3
5016 pmulhuw xmm1, xmm2 // rgb * a
5017 lea eax, [eax + 16]
5018
5019 packuswb xmm0, xmm1
5020 sub ecx, 4
5021 movdqu [edx], xmm0
5022 lea edx, [edx + 16]
5023 jg convertloop
5024 pop edi
5025 pop esi
5026 ret
5027 }
5028 }
5029 #endif // HAS_ARGBUNATTENUATEROW_SSE2
5030
5031 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
5032 // Shuffle table duplicating alpha.
5033 static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
5034 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
5035 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
5036 };
5037 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
5038 // USE_GATHER is not on by default, due to being a slow instruction.
5039 #ifdef USE_GATHER
5040 __declspec(naked) __declspec(align(16))
5041 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
5042 int width) {
5043 __asm {
5044 mov eax, [esp + 4] // src_argb0
5045 mov edx, [esp + 8] // dst_argb
5046 mov ecx, [esp + 12] // width
5047 sub edx, eax
5048 vmovdqa ymm4, kUnattenShuffleAlpha_AVX2
5049
5050 align 4
5051 convertloop:
5052 vmovdqu ymm6, [eax] // read 8 pixels.
5053 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
5054 vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
5055 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
5056 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
5057 vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
5058 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
5059 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
5060 vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
5061 vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
5062 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
5063 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
5064 vpackuswb ymm0, ymm0, ymm1 // unmutated.
5065 sub ecx, 8
5066 vmovdqu [eax + edx], ymm0
5067 lea eax, [eax + 32]
5068 jg convertloop
5069
5070 vzeroupper
5071 ret
5072 }
5073 }
5074 #else // USE_GATHER
5075 __declspec(naked) __declspec(align(16))
5076 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
5077 int width) {
5078 __asm {
5079
5080 mov eax, [esp + 4] // src_argb0
5081 mov edx, [esp + 8] // dst_argb
5082 mov ecx, [esp + 12] // width
5083 sub edx, eax
5084 vmovdqa ymm5, kUnattenShuffleAlpha_AVX2
5085
5086 push esi
5087 push edi
5088
5089 align 4
5090 convertloop:
5091 // replace VPGATHER
5092 movzx esi, byte ptr [eax + 3] // alpha0
5093 movzx edi, byte ptr [eax + 7] // alpha1
5094 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
5095 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
5096 movzx esi, byte ptr [eax + 11] // alpha2
5097 movzx edi, byte ptr [eax + 15] // alpha3
5098 vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
5099 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
5100 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
5101 movzx esi, byte ptr [eax + 19] // alpha4
5102 movzx edi, byte ptr [eax + 23] // alpha5
5103 vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
5104 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
5105 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
5106 movzx esi, byte ptr [eax + 27] // alpha6
5107 movzx edi, byte ptr [eax + 31] // alpha7
5108 vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
5109 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
5110 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
5111 vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
5112 vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
5113 vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
5114 vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
5115 // end of VPGATHER
5116
5117 vmovdqu ymm6, [eax] // read 8 pixels.
5118 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
5119 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
5120 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
5121 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
5122 vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
5123 vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
5124 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
5125 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
5126 vpackuswb ymm0, ymm0, ymm1 // unmutated.
5127 sub ecx, 8
5128 vmovdqu [eax + edx], ymm0
5129 lea eax, [eax + 32]
5130 jg convertloop
5131
5132 pop edi
5133 pop esi
5134 vzeroupper
5135 ret
5136 }
5137 }
5138 #endif // USE_GATHER
5139 #endif // HAS_ARGBUNATTENUATEROW_AVX2
5140
5141 #ifdef HAS_ARGBGRAYROW_SSSE3
5142 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
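// Scalar sketch of the gray value computed below (hypothetical helper): a
// weighted sum of B, G and R using the kARGBToYJ coefficients with the
// kAddYJ64 rounding term, assuming ARGB bytes are stored B, G, R, A. The
// assembly then writes this value to B, G and R and keeps the original alpha.
static uint8 GrayFromARGB_Sketch(const uint8* argb) {
  int y = (argb[0] * kARGBToYJ[0] + argb[1] * kARGBToYJ[1] +
           argb[2] * kARGBToYJ[2] + 64) >> 7;
  return (uint8)(y > 255 ? 255 : y);
}
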
5143 __declspec(naked) __declspec(align(16))
5144 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
5145 __asm {
5146 mov eax, [esp + 4] /* src_argb */
5147 mov edx, [esp + 8] /* dst_argb */
5148 mov ecx, [esp + 12] /* width */
5149 movdqa xmm4, kARGBToYJ
5150 movdqa xmm5, kAddYJ64
5151
5152 align 4
5153 convertloop:
5154 movdqa xmm0, [eax] // G
5155 movdqa xmm1, [eax + 16]
5156 pmaddubsw xmm0, xmm4
5157 pmaddubsw xmm1, xmm4
5158 phaddw xmm0, xmm1
5159 paddw xmm0, xmm5 // Add .5 for rounding.
5160 psrlw xmm0, 7
5161 packuswb xmm0, xmm0 // 8 G bytes
5162 movdqa xmm2, [eax] // A
5163 movdqa xmm3, [eax + 16]
5164 lea eax, [eax + 32]
5165 psrld xmm2, 24
5166 psrld xmm3, 24
5167 packuswb xmm2, xmm3
5168 packuswb xmm2, xmm2 // 8 A bytes
5169 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
5170 punpcklbw xmm0, xmm0 // 8 GG words
5171 punpcklbw xmm3, xmm2 // 8 GA words
5172 movdqa xmm1, xmm0
5173 punpcklwd xmm0, xmm3 // GGGA first 4
5174 punpckhwd xmm1, xmm3 // GGGA next 4
5175 sub ecx, 8
5176 movdqa [edx], xmm0
5177 movdqa [edx + 16], xmm1
5178 lea edx, [edx + 32]
5179 jg convertloop
5180 ret
5181 }
5182 }
5183 #endif // HAS_ARGBGRAYROW_SSSE3
5184
5185 #ifdef HAS_ARGBSEPIAROW_SSSE3
5186 // b = (r * 35 + g * 68 + b * 17) >> 7
5187 // g = (r * 45 + g * 88 + b * 22) >> 7
5188 // r = (r * 50 + g * 98 + b * 24) >> 7
5189 // Constant for ARGB color to sepia tone.
5190 static const vec8 kARGBToSepiaB = {
5191 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
5192 };
5193
5194 static const vec8 kARGBToSepiaG = {
5195 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
5196 };
5197
5198 static const vec8 kARGBToSepiaR = {
5199 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
5200 };
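
// Scalar sketch of the sepia math above (hypothetical helper), assuming ARGB
// bytes are stored B, G, R, A; the weighted sums cannot exceed 255, and the
// row function leaves alpha untouched.
static void SepiaPixel_Sketch(uint8* argb) {
  int b = argb[0], g = argb[1], r = argb[2];
  argb[0] = (uint8)((17 * b + 68 * g + 35 * r) >> 7);
  argb[1] = (uint8)((22 * b + 88 * g + 45 * r) >> 7);
  argb[2] = (uint8)((24 * b + 98 * g + 50 * r) >> 7);
}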
5201
5202 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
5203 __declspec(naked) __declspec(align(16))
5204 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
5205 __asm {
5206 mov eax, [esp + 4] /* dst_argb */
5207 mov ecx, [esp + 8] /* width */
5208 movdqa xmm2, kARGBToSepiaB
5209 movdqa xmm3, kARGBToSepiaG
5210 movdqa xmm4, kARGBToSepiaR
5211
5212 align 4
5213 convertloop:
5214 movdqa xmm0, [eax] // B
5215 movdqa xmm6, [eax + 16]
5216 pmaddubsw xmm0, xmm2
5217 pmaddubsw xmm6, xmm2
5218 phaddw xmm0, xmm6
5219 psrlw xmm0, 7
5220 packuswb xmm0, xmm0 // 8 B values
5221 movdqa xmm5, [eax] // G
5222 movdqa xmm1, [eax + 16]
5223 pmaddubsw xmm5, xmm3
5224 pmaddubsw xmm1, xmm3
5225 phaddw xmm5, xmm1
5226 psrlw xmm5, 7
5227 packuswb xmm5, xmm5 // 8 G values
5228 punpcklbw xmm0, xmm5 // 8 BG values
5229 movdqa xmm5, [eax] // R
5230 movdqa xmm1, [eax + 16]
5231 pmaddubsw xmm5, xmm4
5232 pmaddubsw xmm1, xmm4
5233 phaddw xmm5, xmm1
5234 psrlw xmm5, 7
5235 packuswb xmm5, xmm5 // 8 R values
5236 movdqa xmm6, [eax] // A
5237 movdqa xmm1, [eax + 16]
5238 psrld xmm6, 24
5239 psrld xmm1, 24
5240 packuswb xmm6, xmm1
5241 packuswb xmm6, xmm6 // 8 A values
5242 punpcklbw xmm5, xmm6 // 8 RA values
5243 movdqa xmm1, xmm0 // Weave BG, RA together
5244 punpcklwd xmm0, xmm5 // BGRA first 4
5245 punpckhwd xmm1, xmm5 // BGRA next 4
5246 sub ecx, 8
5247 movdqa [eax], xmm0
5248 movdqa [eax + 16], xmm1
5249 lea eax, [eax + 32]
5250 jg convertloop
5251 ret
5252 }
5253 }
5254 #endif // HAS_ARGBSEPIAROW_SSSE3
5255
5256 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
5257 // Transform 8 ARGB pixels (32 bytes) with color matrix.
5258 // Same as Sepia except matrix is provided.
5259 // TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R
5260 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
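// Scalar sketch of the transform below (hypothetical helper): each output
// channel is a signed dot product of the B, G, R, A bytes with one row of the
// 4x4 matrix (pmaddubsw + phaddsw in the assembly), scaled down by 6 bits and
// clamped to 0..255 (packuswb).
static void ColorMatrixPixel_Sketch(const uint8* src_argb, uint8* dst_argb,
                                    const int8* matrix_argb) {
  int j;
  for (j = 0; j < 4; ++j) {
    int v = (src_argb[0] * matrix_argb[j * 4 + 0] +
             src_argb[1] * matrix_argb[j * 4 + 1] +
             src_argb[2] * matrix_argb[j * 4 + 2] +
             src_argb[3] * matrix_argb[j * 4 + 3]) >> 6;
    dst_argb[j] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}
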
5261 __declspec(naked) __declspec(align(16))
5262 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5263 const int8* matrix_argb, int width) {
5264 __asm {
5265 mov eax, [esp + 4] /* src_argb */
5266 mov edx, [esp + 8] /* dst_argb */
5267 mov ecx, [esp + 12] /* matrix_argb */
5268 movdqu xmm5, [ecx]
5269 pshufd xmm2, xmm5, 0x00
5270 pshufd xmm3, xmm5, 0x55
5271 pshufd xmm4, xmm5, 0xaa
5272 pshufd xmm5, xmm5, 0xff
5273 mov ecx, [esp + 16] /* width */
5274
5275 align 4
5276 convertloop:
5277 movdqa xmm0, [eax] // B
5278 movdqa xmm7, [eax + 16]
5279 pmaddubsw xmm0, xmm2
5280 pmaddubsw xmm7, xmm2
5281 movdqa xmm6, [eax] // G
5282 movdqa xmm1, [eax + 16]
5283 pmaddubsw xmm6, xmm3
5284 pmaddubsw xmm1, xmm3
5285 phaddsw xmm0, xmm7 // B
5286 phaddsw xmm6, xmm1 // G
5287 psraw xmm0, 6 // B
5288 psraw xmm6, 6 // G
5289 packuswb xmm0, xmm0 // 8 B values
5290 packuswb xmm6, xmm6 // 8 G values
5291 punpcklbw xmm0, xmm6 // 8 BG values
5292 movdqa xmm1, [eax] // R
5293 movdqa xmm7, [eax + 16]
5294 pmaddubsw xmm1, xmm4
5295 pmaddubsw xmm7, xmm4
5296 phaddsw xmm1, xmm7 // R
5297 movdqa xmm6, [eax] // A
5298 movdqa xmm7, [eax + 16]
5299 pmaddubsw xmm6, xmm5
5300 pmaddubsw xmm7, xmm5
5301 phaddsw xmm6, xmm7 // A
5302 psraw xmm1, 6 // R
5303 psraw xmm6, 6 // A
5304 packuswb xmm1, xmm1 // 8 R values
5305 packuswb xmm6, xmm6 // 8 A values
5306 punpcklbw xmm1, xmm6 // 8 RA values
5307 movdqa xmm6, xmm0 // Weave BG, RA together
5308 punpcklwd xmm0, xmm1 // BGRA first 4
5309 punpckhwd xmm6, xmm1 // BGRA next 4
5310 sub ecx, 8
5311 movdqa [edx], xmm0
5312 movdqa [edx + 16], xmm6
5313 lea eax, [eax + 32]
5314 lea edx, [edx + 32]
5315 jg convertloop
5316 ret
5317 }
5318 }
5319 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
5320
5321 #ifdef HAS_ARGBQUANTIZEROW_SSE2
5322 // Quantize 4 ARGB pixels (16 bytes).
5323 // Aligned to 16 bytes.
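// Scalar sketch of the quantization below (hypothetical helper): scale maps a
// channel into a bucket index ((v * scale) >> 16), which is re-expanded by
// interval_size and offset by interval_offset; packuswb in the assembly also
// saturates the result to 255. Alpha is preserved by the row function.
static uint8 QuantizeChannel_Sketch(uint8 v, int scale, int interval_size,
                                    int interval_offset) {
  int q = (((int)(v) * scale) >> 16) * interval_size + interval_offset;
  return (uint8)(q > 255 ? 255 : q);
}
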
5324 __declspec(naked) __declspec(align(16))
5325 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
5326 int interval_offset, int width) {
5327 __asm {
5328 mov eax, [esp + 4] /* dst_argb */
5329 movd xmm2, [esp + 8] /* scale */
5330 movd xmm3, [esp + 12] /* interval_size */
5331 movd xmm4, [esp + 16] /* interval_offset */
5332 mov ecx, [esp + 20] /* width */
5333 pshuflw xmm2, xmm2, 040h
5334 pshufd xmm2, xmm2, 044h
5335 pshuflw xmm3, xmm3, 040h
5336 pshufd xmm3, xmm3, 044h
5337 pshuflw xmm4, xmm4, 040h
5338 pshufd xmm4, xmm4, 044h
5339 pxor xmm5, xmm5 // constant 0
5340 pcmpeqb xmm6, xmm6 // generate mask 0xff000000
5341 pslld xmm6, 24
5342
5343 align 4
5344 convertloop:
5345 movdqa xmm0, [eax] // read 4 pixels
5346 punpcklbw xmm0, xmm5 // first 2 pixels
5347 pmulhuw xmm0, xmm2 // pixel * scale >> 16
5348 movdqa xmm1, [eax] // read 4 pixels
5349 punpckhbw xmm1, xmm5 // next 2 pixels
5350 pmulhuw xmm1, xmm2
5351 pmullw xmm0, xmm3 // * interval_size
5352 movdqa xmm7, [eax] // read 4 pixels
5353 pmullw xmm1, xmm3
5354 pand xmm7, xmm6 // mask alpha
5355 paddw xmm0, xmm4 // + interval_size / 2
5356 paddw xmm1, xmm4
5357 packuswb xmm0, xmm1
5358 por xmm0, xmm7
5359 sub ecx, 4
5360 movdqa [eax], xmm0
5361 lea eax, [eax + 16]
5362 jg convertloop
5363 ret
5364 }
5365 }
5366 #endif // HAS_ARGBQUANTIZEROW_SSE2
5367
5368 #ifdef HAS_ARGBSHADEROW_SSE2
5369 // Shade 4 pixels at a time by specified value.
5370 // Aligned to 16 bytes.
5371 __declspec(naked) __declspec(align(16))
5372 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
5373 uint32 value) {
5374 __asm {
5375 mov eax, [esp + 4] // src_argb
5376 mov edx, [esp + 8] // dst_argb
5377 mov ecx, [esp + 12] // width
5378 movd xmm2, [esp + 16] // value
5379 punpcklbw xmm2, xmm2
5380 punpcklqdq xmm2, xmm2
5381
5382 align 4
5383 convertloop:
5384 movdqa xmm0, [eax] // read 4 pixels
5385 lea eax, [eax + 16]
5386 movdqa xmm1, xmm0
5387 punpcklbw xmm0, xmm0 // first 2
5388 punpckhbw xmm1, xmm1 // next 2
5389 pmulhuw xmm0, xmm2 // argb * value
5390 pmulhuw xmm1, xmm2 // argb * value
5391 psrlw xmm0, 8
5392 psrlw xmm1, 8
5393 packuswb xmm0, xmm1
5394 sub ecx, 4
5395 movdqa [edx], xmm0
5396 lea edx, [edx + 16]
5397 jg convertloop
5398
5399 ret
5400 }
5401 }
5402 #endif // HAS_ARGBSHADEROW_SSE2
5403
5404 #ifdef HAS_ARGBMULTIPLYROW_SSE2
5405 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
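// Scalar sketch of the per-channel multiply below (hypothetical helper): one
// operand is widened to c0 * 0x0101 and pmulhuw keeps the high 16 bits of the
// product, which approximates (c0 * c1) / 255.
static uint8 MultiplyChannel_Sketch(uint8 c0, uint8 c1) {
  return (uint8)((((uint32)(c0) * 0x0101u) * (uint32)(c1)) >> 16);
}
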
5406 __declspec(naked) __declspec(align(16))
5407 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
5408 uint8* dst_argb, int width) {
5409 __asm {
5410 push esi
5411 mov eax, [esp + 4 + 4] // src_argb0
5412 mov esi, [esp + 4 + 8] // src_argb1
5413 mov edx, [esp + 4 + 12] // dst_argb
5414 mov ecx, [esp + 4 + 16] // width
5415 pxor xmm5, xmm5 // constant 0
5416
5417 align 4
5418 convertloop:
5419 movdqu xmm0, [eax] // read 4 pixels from src_argb0
5420 movdqu xmm2, [esi] // read 4 pixels from src_argb1
5421 movdqu xmm1, xmm0
5422 movdqu xmm3, xmm2
5423 punpcklbw xmm0, xmm0 // first 2
5424 punpckhbw xmm1, xmm1 // next 2
5425 punpcklbw xmm2, xmm5 // first 2
5426 punpckhbw xmm3, xmm5 // next 2
5427 pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
5428 pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
5429 lea eax, [eax + 16]
5430 lea esi, [esi + 16]
5431 packuswb xmm0, xmm1
5432 sub ecx, 4
5433 movdqu [edx], xmm0
5434 lea edx, [edx + 16]
5435 jg convertloop
5436
5437 pop esi
5438 ret
5439 }
5440 }
5441 #endif // HAS_ARGBMULTIPLYROW_SSE2
5442
5443 #ifdef HAS_ARGBADDROW_SSE2
5444 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
5445 // TODO(fbarchard): Port this to posix, neon and other math functions.
5446 __declspec(naked) __declspec(align(16))
5447 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
5448 uint8* dst_argb, int width) {
5449 __asm {
5450 push esi
5451 mov eax, [esp + 4 + 4] // src_argb0
5452 mov esi, [esp + 4 + 8] // src_argb1
5453 mov edx, [esp + 4 + 12] // dst_argb
5454 mov ecx, [esp + 4 + 16] // width
5455
5456 sub ecx, 4
5457 jl convertloop49
5458
5459 align 4
5460 convertloop4:
5461 movdqu xmm0, [eax] // read 4 pixels from src_argb0
5462 lea eax, [eax + 16]
5463 movdqu xmm1, [esi] // read 4 pixels from src_argb1
5464 lea esi, [esi + 16]
5465 paddusb xmm0, xmm1 // src_argb0 + src_argb1
5466 sub ecx, 4
5467 movdqu [edx], xmm0
5468 lea edx, [edx + 16]
5469 jge convertloop4
5470
5471 convertloop49:
5472 add ecx, 4 - 1
5473 jl convertloop19
5474
5475 convertloop1:
5476 movd xmm0, [eax] // read 1 pixel from src_argb0
5477 lea eax, [eax + 4]
5478 movd xmm1, [esi] // read 1 pixel from src_argb1
5479 lea esi, [esi + 4]
5480 paddusb xmm0, xmm1 // src_argb0 + src_argb1
5481 sub ecx, 1
5482 movd [edx], xmm0
5483 lea edx, [edx + 4]
5484 jge convertloop1
5485
5486 convertloop19:
5487 pop esi
5488 ret
5489 }
5490 }
5491 #endif // HAS_ARGBADDROW_SSE2
5492
5493 #ifdef HAS_ARGBSUBTRACTROW_SSE2
5494 // Subtract one row of ARGB pixels from another, 4 pixels at a time.
5495 __declspec(naked) __declspec(align(16))
5496 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
5497 uint8* dst_argb, int width) {
5498 __asm {
5499 push esi
5500 mov eax, [esp + 4 + 4] // src_argb0
5501 mov esi, [esp + 4 + 8] // src_argb1
5502 mov edx, [esp + 4 + 12] // dst_argb
5503 mov ecx, [esp + 4 + 16] // width
5504
5505 align 4
5506 convertloop:
5507 movdqu xmm0, [eax] // read 4 pixels from src_argb0
5508 lea eax, [eax + 16]
5509 movdqu xmm1, [esi] // read 4 pixels from src_argb1
5510 lea esi, [esi + 16]
5511 psubusb xmm0, xmm1 // src_argb0 - src_argb1
5512 sub ecx, 4
5513 movdqu [edx], xmm0
5514 lea edx, [edx + 16]
5515 jg convertloop
5516
5517 pop esi
5518 ret
5519 }
5520 }
5521 #endif // HAS_ARGBSUBTRACTROW_SSE2
5522
5523 #ifdef HAS_ARGBMULTIPLYROW_AVX2
5524 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
5525 __declspec(naked) __declspec(align(16))
5526 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
5527 uint8* dst_argb, int width) {
5528 __asm {
5529 push esi
5530 mov eax, [esp + 4 + 4] // src_argb0
5531 mov esi, [esp + 4 + 8] // src_argb1
5532 mov edx, [esp + 4 + 12] // dst_argb
5533 mov ecx, [esp + 4 + 16] // width
5534 vpxor ymm5, ymm5, ymm5 // constant 0
5535
5536 align 4
5537 convertloop:
5538 vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
5539 lea eax, [eax + 32]
5540 vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
5541 lea esi, [esi + 32]
5542 vpunpcklbw ymm0, ymm1, ymm1 // low 4
5543 vpunpckhbw ymm1, ymm1, ymm1 // high 4
5544 vpunpcklbw ymm2, ymm3, ymm5 // low 4
5545 vpunpckhbw ymm3, ymm3, ymm5 // high 4
5546 vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
5547 vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
5548 vpackuswb ymm0, ymm0, ymm1
5549 vmovdqu [edx], ymm0
5550 lea edx, [edx + 32]
5551 sub ecx, 8
5552 jg convertloop
5553
5554 pop esi
5555 vzeroupper
5556 ret
5557 }
5558 }
5559 #endif // HAS_ARGBMULTIPLYROW_AVX2
5560
5561 #ifdef HAS_ARGBADDROW_AVX2
5562 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
5563 __declspec(naked) __declspec(align(16))
5564 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
5565 uint8* dst_argb, int width) {
5566 __asm {
5567 push esi
5568 mov eax, [esp + 4 + 4] // src_argb0
5569 mov esi, [esp + 4 + 8] // src_argb1
5570 mov edx, [esp + 4 + 12] // dst_argb
5571 mov ecx, [esp + 4 + 16] // width
5572
5573 align 4
5574 convertloop:
5575 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
5576 lea eax, [eax + 32]
5577 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
5578 lea esi, [esi + 32]
5579 vmovdqu [edx], ymm0
5580 lea edx, [edx + 32]
5581 sub ecx, 8
5582 jg convertloop
5583
5584 pop esi
5585 vzeroupper
5586 ret
5587 }
5588 }
5589 #endif // HAS_ARGBADDROW_AVX2
5590
5591 #ifdef HAS_ARGBSUBTRACTROW_AVX2
5592 // Subtract one row of ARGB pixels from another, 8 pixels at a time.
5593 __declspec(naked) __declspec(align(16))
5594 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
5595 uint8* dst_argb, int width) {
5596 __asm {
5597 push esi
5598 mov eax, [esp + 4 + 4] // src_argb0
5599 mov esi, [esp + 4 + 8] // src_argb1
5600 mov edx, [esp + 4 + 12] // dst_argb
5601 mov ecx, [esp + 4 + 16] // width
5602
5603 align 4
5604 convertloop:
5605 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
5606 lea eax, [eax + 32]
5607 vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
5608 lea esi, [esi + 32]
5609 vmovdqu [edx], ymm0
5610 lea edx, [edx + 32]
5611 sub ecx, 8
5612 jg convertloop
5613
5614 pop esi
5615 vzeroupper
5616 ret
5617 }
5618 }
5619 #endif // HAS_ARGBSUBTRACTROW_AVX2
5620
5621 #ifdef HAS_SOBELXROW_SSE2
5622 // SobelX as a matrix is
5623 // -1 0 1
5624 // -2 0 2
5625 // -1 0 1
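// Scalar sketch of one SobelX output (hypothetical helper): the kernel above
// applied to three source rows, then the absolute value clamped to 255, which
// is what the psubw/paddw/pmaxsw/packuswb sequence below computes.
static uint8 SobelXPixel_Sketch(const uint8* y0, const uint8* y1,
                                const uint8* y2, int i) {
  int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) + (y2[i] - y2[i + 2]);
  if (s < 0) s = -s;
  return (uint8)(s > 255 ? 255 : s);
}
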
5626 __declspec(naked) __declspec(align(16))
5627 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5628 const uint8* src_y2, uint8* dst_sobelx, int width) {
5629 __asm {
5630 push esi
5631 push edi
5632 mov eax, [esp + 8 + 4] // src_y0
5633 mov esi, [esp + 8 + 8] // src_y1
5634 mov edi, [esp + 8 + 12] // src_y2
5635 mov edx, [esp + 8 + 16] // dst_sobelx
5636 mov ecx, [esp + 8 + 20] // width
5637 sub esi, eax
5638 sub edi, eax
5639 sub edx, eax
5640 pxor xmm5, xmm5 // constant 0
5641
5642 align 4
5643 convertloop:
5644 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
5645 movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
5646 punpcklbw xmm0, xmm5
5647 punpcklbw xmm1, xmm5
5648 psubw xmm0, xmm1
5649 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5650 movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5651 punpcklbw xmm1, xmm5
5652 punpcklbw xmm2, xmm5
5653 psubw xmm1, xmm2
5654 movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
5655 movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
5656 punpcklbw xmm2, xmm5
5657 punpcklbw xmm3, xmm5
5658 psubw xmm2, xmm3
5659 paddw xmm0, xmm2
5660 paddw xmm0, xmm1
5661 paddw xmm0, xmm1
5662 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5663 psubw xmm1, xmm0
5664 pmaxsw xmm0, xmm1
5665 packuswb xmm0, xmm0
5666 sub ecx, 8
5667 movq qword ptr [eax + edx], xmm0
5668 lea eax, [eax + 8]
5669 jg convertloop
5670
5671 pop edi
5672 pop esi
5673 ret
5674 }
5675 }
5676 #endif // HAS_SOBELXROW_SSE2
5677
5678 #ifdef HAS_SOBELYROW_SSE2
5679 // SobelY as a matrix is
5680 // -1 -2 -1
5681 // 0 0 0
5682 // 1 2 1
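// Scalar sketch of one SobelY output (hypothetical helper), mirroring the
// two-row kernel used below.
static uint8 SobelYPixel_Sketch(const uint8* y0, const uint8* y1, int i) {
  int s = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) +
          (y0[i + 2] - y1[i + 2]);
  if (s < 0) s = -s;
  return (uint8)(s > 255 ? 255 : s);
}
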
5683 __declspec(naked) __declspec(align(16))
5684 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
5685 uint8* dst_sobely, int width) {
5686 __asm {
5687 push esi
5688 mov eax, [esp + 4 + 4] // src_y0
5689 mov esi, [esp + 4 + 8] // src_y1
5690 mov edx, [esp + 4 + 12] // dst_sobely
5691 mov ecx, [esp + 4 + 16] // width
5692 sub esi, eax
5693 sub edx, eax
5694 pxor xmm5, xmm5 // constant 0
5695
5696 align 4
5697 convertloop:
5698 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
5699 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5700 punpcklbw xmm0, xmm5
5701 punpcklbw xmm1, xmm5
5702 psubw xmm0, xmm1
5703 movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
5704 movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
5705 punpcklbw xmm1, xmm5
5706 punpcklbw xmm2, xmm5
5707 psubw xmm1, xmm2
5708 movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
5709 movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5710 punpcklbw xmm2, xmm5
5711 punpcklbw xmm3, xmm5
5712 psubw xmm2, xmm3
5713 paddw xmm0, xmm2
5714 paddw xmm0, xmm1
5715 paddw xmm0, xmm1
5716 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5717 psubw xmm1, xmm0
5718 pmaxsw xmm0, xmm1
5719 packuswb xmm0, xmm0
5720 sub ecx, 8
5721 movq qword ptr [eax + edx], xmm0
5722 lea eax, [eax + 8]
5723 jg convertloop
5724
5725 pop esi
5726 ret
5727 }
5728 }
5729 #endif // HAS_SOBELYROW_SSE2
5730
5731 #ifdef HAS_SOBELROW_SSE2
5732 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5733 // A = 255
5734 // R = Sobel
5735 // G = Sobel
5736 // B = Sobel
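// Scalar sketch of the packing below (hypothetical helper): the saturated sum
// of SobelX and SobelY is replicated into B, G and R with alpha forced to 255.
static uint32 SobelPixel_Sketch(uint8 sobelx, uint8 sobely) {
  uint32 s = (uint32)(sobelx) + (uint32)(sobely);
  if (s > 255) s = 255;
  return 0xff000000u | (s << 16) | (s << 8) | s;
}
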
5737 __declspec(naked) __declspec(align(16))
5738 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5739 uint8* dst_argb, int width) {
5740 __asm {
5741 push esi
5742 mov eax, [esp + 4 + 4] // src_sobelx
5743 mov esi, [esp + 4 + 8] // src_sobely
5744 mov edx, [esp + 4 + 12] // dst_argb
5745 mov ecx, [esp + 4 + 16] // width
5746 sub esi, eax
5747 pcmpeqb xmm5, xmm5 // alpha 255
5748 pslld xmm5, 24 // 0xff000000
5749
5750 align 4
5751 convertloop:
5752 movdqa xmm0, [eax] // read 16 pixels src_sobelx
5753 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
5754 lea eax, [eax + 16]
5755 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5756 movdqa xmm2, xmm0 // GG
5757 punpcklbw xmm2, xmm0 // First 8
5758 punpckhbw xmm0, xmm0 // Next 8
5759 movdqa xmm1, xmm2 // GGGG
5760 punpcklwd xmm1, xmm2 // First 4
5761 punpckhwd xmm2, xmm2 // Next 4
5762 por xmm1, xmm5 // GGGA
5763 por xmm2, xmm5
5764 movdqa xmm3, xmm0 // GGGG
5765 punpcklwd xmm3, xmm0 // Next 4
5766 punpckhwd xmm0, xmm0 // Last 4
5767 por xmm3, xmm5 // GGGA
5768 por xmm0, xmm5
5769 sub ecx, 16
5770 movdqa [edx], xmm1
5771 movdqa [edx + 16], xmm2
5772 movdqa [edx + 32], xmm3
5773 movdqa [edx + 48], xmm0
5774 lea edx, [edx + 64]
5775 jg convertloop
5776
5777 pop esi
5778 ret
5779 }
5780 }
5781 #endif // HAS_SOBELROW_SSE2
5782
5783 #ifdef HAS_SOBELTOPLANEROW_SSE2
5784 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
5785 __declspec(naked) __declspec(align(16))
5786 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5787 uint8* dst_y, int width) {
5788 __asm {
5789 push esi
5790 mov eax, [esp + 4 + 4] // src_sobelx
5791 mov esi, [esp + 4 + 8] // src_sobely
5792 mov edx, [esp + 4 + 12] // dst_y
5793 mov ecx, [esp + 4 + 16] // width
5794 sub esi, eax
5795
5796 align 4
5797 convertloop:
5798 movdqa xmm0, [eax] // read 16 pixels src_sobelx
5799 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
5800 lea eax, [eax + 16]
5801 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5802 sub ecx, 16
5803 movdqa [edx], xmm0
5804 lea edx, [edx + 16]
5805 jg convertloop
5806
5807 pop esi
5808 ret
5809 }
5810 }
5811 #endif // HAS_SOBELTOPLANEROW_SSE2
5812
5813 #ifdef HAS_SOBELXYROW_SSE2
5814 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5815 // A = 255
5816 // R = Sobel X
5817 // G = Sobel
5818 // B = Sobel Y
5819 __declspec(naked) __declspec(align(16))
5820 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5821 uint8* dst_argb, int width) {
5822 __asm {
5823 push esi
5824 mov eax, [esp + 4 + 4] // src_sobelx
5825 mov esi, [esp + 4 + 8] // src_sobely
5826 mov edx, [esp + 4 + 12] // dst_argb
5827 mov ecx, [esp + 4 + 16] // width
5828 sub esi, eax
5829 pcmpeqb xmm5, xmm5 // alpha 255
5830
5831 align 4
5832 convertloop:
5833 movdqa xmm0, [eax] // read 16 pixels src_sobelx
5834 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
5835 lea eax, [eax + 16]
5836 movdqa xmm2, xmm0
5837 paddusb xmm2, xmm1 // sobel = sobelx + sobely
5838 movdqa xmm3, xmm0 // XA
5839 punpcklbw xmm3, xmm5
5840 punpckhbw xmm0, xmm5
5841 movdqa xmm4, xmm1 // YS
5842 punpcklbw xmm4, xmm2
5843 punpckhbw xmm1, xmm2
5844 movdqa xmm6, xmm4 // YSXA
5845 punpcklwd xmm6, xmm3 // First 4
5846 punpckhwd xmm4, xmm3 // Next 4
5847 movdqa xmm7, xmm1 // YSXA
5848 punpcklwd xmm7, xmm0 // Next 4
5849 punpckhwd xmm1, xmm0 // Last 4
5850 sub ecx, 16
5851 movdqa [edx], xmm6
5852 movdqa [edx + 16], xmm4
5853 movdqa [edx + 32], xmm7
5854 movdqa [edx + 48], xmm1
5855 lea edx, [edx + 64]
5856 jg convertloop
5857
5858 pop esi
5859 ret
5860 }
5861 }
5862 #endif // HAS_SOBELXYROW_SSE2
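
// Scalar sketch of the per-pixel packing performed by SobelXYRow_SSE2 above:
// B = Sobel Y, G = saturated Sobel X + Sobel Y, R = Sobel X, A = 255.
// Hypothetical helper name; not part of the libyuv API.
static void SobelXYRow_ReferenceSketch(const uint8* src_sobelx,
                                       const uint8* src_sobely,
                                       uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    if (s > 255) s = 255;                 // paddusb saturation.
    dst_argb[i * 4 + 0] = src_sobely[i];  // B
    dst_argb[i * 4 + 1] = (uint8)s;       // G
    dst_argb[i * 4 + 2] = src_sobelx[i];  // R
    dst_argb[i * 4 + 3] = 255u;           // A
  }
}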
5863
5864 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5865 // Consider float CumulativeSum.
5866 // Consider calling CumulativeSum one row at time as needed.
5867 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5868 // Convert cumulative sum for an area to an average for 1 pixel.
5869 // topleft is pointer to top left of CumulativeSum buffer for area.
5870 // botleft is pointer to bottom left of CumulativeSum buffer.
5871 // width is offset from left to right of area in CumulativeSum buffer measured
5872 // in number of ints.
5873 // area is the number of pixels in the area being averaged.
5874 // dst points to pixel to store result to.
5875 // count is number of averaged pixels to produce.
5876 // Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
5877 // aligned.
5878 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
5879 int width, int area, uint8* dst,
5880 int count) {
5881 __asm {
5882 mov eax, topleft // eax topleft
5883 mov esi, botleft // esi botleft
5884 mov edx, width
5885 movd xmm5, area
5886 mov edi, dst
5887 mov ecx, count
5888 cvtdq2ps xmm5, xmm5
5889 rcpss xmm4, xmm5 // 1.0f / area
5890 pshufd xmm4, xmm4, 0
5891 sub ecx, 4
5892 jl l4b
5893
5894 cmp area, 128 // 128 pixels will not overflow 15 bits.
5895 ja l4
5896
5897 pshufd xmm5, xmm5, 0 // area
5898 pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
5899 psrld xmm6, 16
5900 cvtdq2ps xmm6, xmm6
5901 addps xmm5, xmm6 // (65536.0 + area - 1)
5902 mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
5903 cvtps2dq xmm5, xmm5 // 0.16 fixed point
5904 packssdw xmm5, xmm5 // 16 bit shorts
5905
5906 // 4 pixel loop small blocks.
5907 align 4
5908 s4:
5909 // top left
5910 movdqa xmm0, [eax]
5911 movdqa xmm1, [eax + 16]
5912 movdqa xmm2, [eax + 32]
5913 movdqa xmm3, [eax + 48]
5914
5915 // - top right
5916 psubd xmm0, [eax + edx * 4]
5917 psubd xmm1, [eax + edx * 4 + 16]
5918 psubd xmm2, [eax + edx * 4 + 32]
5919 psubd xmm3, [eax + edx * 4 + 48]
5920 lea eax, [eax + 64]
5921
5922 // - bottom left
5923 psubd xmm0, [esi]
5924 psubd xmm1, [esi + 16]
5925 psubd xmm2, [esi + 32]
5926 psubd xmm3, [esi + 48]
5927
5928 // + bottom right
5929 paddd xmm0, [esi + edx * 4]
5930 paddd xmm1, [esi + edx * 4 + 16]
5931 paddd xmm2, [esi + edx * 4 + 32]
5932 paddd xmm3, [esi + edx * 4 + 48]
5933 lea esi, [esi + 64]
5934
5935 packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
5936 packssdw xmm2, xmm3
5937
5938 pmulhuw xmm0, xmm5
5939 pmulhuw xmm2, xmm5
5940
5941 packuswb xmm0, xmm2
5942 movdqu [edi], xmm0
5943 lea edi, [edi + 16]
5944 sub ecx, 4
5945 jge s4
5946
5947 jmp l4b
5948
5949 // 4 pixel loop
5950 align 4
5951 l4:
5952 // top left
5953 movdqa xmm0, [eax]
5954 movdqa xmm1, [eax + 16]
5955 movdqa xmm2, [eax + 32]
5956 movdqa xmm3, [eax + 48]
5957
5958 // - top right
5959 psubd xmm0, [eax + edx * 4]
5960 psubd xmm1, [eax + edx * 4 + 16]
5961 psubd xmm2, [eax + edx * 4 + 32]
5962 psubd xmm3, [eax + edx * 4 + 48]
5963 lea eax, [eax + 64]
5964
5965 // - bottom left
5966 psubd xmm0, [esi]
5967 psubd xmm1, [esi + 16]
5968 psubd xmm2, [esi + 32]
5969 psubd xmm3, [esi + 48]
5970
5971 // + bottom right
5972 paddd xmm0, [esi + edx * 4]
5973 paddd xmm1, [esi + edx * 4 + 16]
5974 paddd xmm2, [esi + edx * 4 + 32]
5975 paddd xmm3, [esi + edx * 4 + 48]
5976 lea esi, [esi + 64]
5977
5978 cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
5979 cvtdq2ps xmm1, xmm1
5980 mulps xmm0, xmm4
5981 mulps xmm1, xmm4
5982 cvtdq2ps xmm2, xmm2
5983 cvtdq2ps xmm3, xmm3
5984 mulps xmm2, xmm4
5985 mulps xmm3, xmm4
5986 cvtps2dq xmm0, xmm0
5987 cvtps2dq xmm1, xmm1
5988 cvtps2dq xmm2, xmm2
5989 cvtps2dq xmm3, xmm3
5990 packssdw xmm0, xmm1
5991 packssdw xmm2, xmm3
5992 packuswb xmm0, xmm2
5993 movdqu [edi], xmm0
5994 lea edi, [edi + 16]
5995 sub ecx, 4
5996 jge l4
5997
5998 l4b:
5999 add ecx, 4 - 1
6000 jl l1b
6001
6002 // 1 pixel loop
6003 align 4
6004 l1:
6005 movdqa xmm0, [eax]
6006 psubd xmm0, [eax + edx * 4]
6007 lea eax, [eax + 16]
6008 psubd xmm0, [esi]
6009 paddd xmm0, [esi + edx * 4]
6010 lea esi, [esi + 16]
6011 cvtdq2ps xmm0, xmm0
6012 mulps xmm0, xmm4
6013 cvtps2dq xmm0, xmm0
6014 packssdw xmm0, xmm0
6015 packuswb xmm0, xmm0
6016 movd dword ptr [edi], xmm0
6017 lea edi, [edi + 4]
6018 sub ecx, 1
6019 jge l1
6020 l1b:
6021 }
6022 }
6023 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
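
// Scalar sketch of the integral-image averaging above, under the same
// conventions as the comment block: each pixel is 4 int32 channel sums,
// |width| is the box width measured in int32 elements, and the box sum is
// topleft - topright - botleft + botright, scaled by 1 / |area|.
// Illustrative name only; the fixed-point small-block path above computes
// the same value slightly differently.
static void CumulativeSumToAverageRow_ReferenceSketch(
    const int32* topleft, const int32* botleft, int width, int area,
    uint8* dst, int count) {
  float ooa = 1.0f / area;   // reciprocal of the box area, as rcpss above.
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      int32 sum = topleft[c] - topleft[width + c] -
                  botleft[c] + botleft[width + c];
      int v = (int)(sum * ooa);
      if (v < 0) v = 0;
      if (v > 255) v = 255;
      dst[c] = (uint8)v;
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}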
6024
6025 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
6026 // Creates a table of cumulative sums where each value is a sum of all values
6027 // above and to the left of the value.
6028 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
6029 const int32* previous_cumsum, int width) {
6030 __asm {
6031 mov eax, row
6032 mov edx, cumsum
6033 mov esi, previous_cumsum
6034 mov ecx, width
6035 pxor xmm0, xmm0
6036 pxor xmm1, xmm1
6037
6038 sub ecx, 4
6039 jl l4b
6040 test edx, 15
6041 jne l4b
6042
6043 // 4 pixel loop
6044 align 4
6045 l4:
6046 movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
6047 lea eax, [eax + 16]
6048 movdqa xmm4, xmm2
6049
6050 punpcklbw xmm2, xmm1
6051 movdqa xmm3, xmm2
6052 punpcklwd xmm2, xmm1
6053 punpckhwd xmm3, xmm1
6054
6055 punpckhbw xmm4, xmm1
6056 movdqa xmm5, xmm4
6057 punpcklwd xmm4, xmm1
6058 punpckhwd xmm5, xmm1
6059
6060 paddd xmm0, xmm2
6061 movdqa xmm2, [esi] // previous row above.
6062 paddd xmm2, xmm0
6063
6064 paddd xmm0, xmm3
6065 movdqa xmm3, [esi + 16]
6066 paddd xmm3, xmm0
6067
6068 paddd xmm0, xmm4
6069 movdqa xmm4, [esi + 32]
6070 paddd xmm4, xmm0
6071
6072 paddd xmm0, xmm5
6073 movdqa xmm5, [esi + 48]
6074 lea esi, [esi + 64]
6075 paddd xmm5, xmm0
6076
6077 movdqa [edx], xmm2
6078 movdqa [edx + 16], xmm3
6079 movdqa [edx + 32], xmm4
6080 movdqa [edx + 48], xmm5
6081
6082 lea edx, [edx + 64]
6083 sub ecx, 4
6084 jge l4
6085
6086 l4b:
6087 add ecx, 4 - 1
6088 jl l1b
6089
6090 // 1 pixel loop
6091 align 4
6092 l1:
6093 movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
6094 lea eax, [eax + 4]
6095 punpcklbw xmm2, xmm1
6096 punpcklwd xmm2, xmm1
6097 paddd xmm0, xmm2
6098 movdqu xmm2, [esi]
6099 lea esi, [esi + 16]
6100 paddd xmm2, xmm0
6101 movdqu [edx], xmm2
6102 lea edx, [edx + 16]
6103 sub ecx, 1
6104 jge l1
6105
6106 l1b:
6107 }
6108 }
6109 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
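
// Scalar sketch of the row-wise cumulative sum above: each output element is
// the running per-channel sum of this row plus the already accumulated value
// from the row above, producing a summed-area (integral image) table.
// Hypothetical helper; libyuv's own C fallback lives elsewhere.
static void ComputeCumulativeSumRow_ReferenceSketch(
    const uint8* row, int32* cumsum, const int32* previous_cumsum, int width) {
  int32 sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}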
6110
6111 #ifdef HAS_ARGBAFFINEROW_SSE2
6112 // Copy ARGB pixels from source image with slope to a row of destination.
6113 __declspec(naked) __declspec(align(16))
6114 LIBYUV_API
6115 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
6116 uint8* dst_argb, const float* uv_dudv, int width) {
6117 __asm {
6118 push esi
6119 push edi
6120 mov eax, [esp + 12] // src_argb
6121 mov esi, [esp + 16] // stride
6122 mov edx, [esp + 20] // dst_argb
6123 mov ecx, [esp + 24] // pointer to uv_dudv
6124 movq xmm2, qword ptr [ecx] // uv
6125 movq xmm7, qword ptr [ecx + 8] // dudv
6126 mov ecx, [esp + 28] // width
6127 shl esi, 16 // 4, stride
6128 add esi, 4
6129 movd xmm5, esi
6130 sub ecx, 4
6131 jl l4b
6132
6133 // setup for 4 pixel loop
6134 pshufd xmm7, xmm7, 0x44 // dup dudv
6135 pshufd xmm5, xmm5, 0 // dup 4, stride
6136 movdqa xmm0, xmm2 // x0, y0, x1, y1
6137 addps xmm0, xmm7
6138 movlhps xmm2, xmm0
6139 movdqa xmm4, xmm7
6140 addps xmm4, xmm4 // dudv *= 2
6141 movdqa xmm3, xmm2 // x2, y2, x3, y3
6142 addps xmm3, xmm4
6143 addps xmm4, xmm4 // dudv *= 4
6144
6145 // 4 pixel loop
6146 align 4
6147 l4:
6148 cvttps2dq xmm0, xmm2 // x, y float to int first 2
6149 cvttps2dq xmm1, xmm3 // x, y float to int next 2
6150 packssdw xmm0, xmm1 // x, y as 8 shorts
6151 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
6152 movd esi, xmm0
6153 pshufd xmm0, xmm0, 0x39 // shift right
6154 movd edi, xmm0
6155 pshufd xmm0, xmm0, 0x39 // shift right
6156 movd xmm1, [eax + esi] // read pixel 0
6157 movd xmm6, [eax + edi] // read pixel 1
6158 punpckldq xmm1, xmm6 // combine pixel 0 and 1
6159 addps xmm2, xmm4 // x, y += dx, dy first 2
6160 movq qword ptr [edx], xmm1
6161 movd esi, xmm0
6162 pshufd xmm0, xmm0, 0x39 // shift right
6163 movd edi, xmm0
6164 movd xmm6, [eax + esi] // read pixel 2
6165 movd xmm0, [eax + edi] // read pixel 3
6166 punpckldq xmm6, xmm0 // combine pixel 2 and 3
6167 addps xmm3, xmm4 // x, y += dx, dy next 2
6168 sub ecx, 4
6169 movq qword ptr 8[edx], xmm6
6170 lea edx, [edx + 16]
6171 jge l4
6172
6173 l4b:
6174 add ecx, 4 - 1
6175 jl l1b
6176
6177 // 1 pixel loop
6178 align 4
6179 l1:
6180 cvttps2dq xmm0, xmm2 // x, y float to int
6181 packssdw xmm0, xmm0 // x, y as shorts
6182 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
6183 addps xmm2, xmm7 // x, y += dx, dy
6184 movd esi, xmm0
6185 movd xmm0, [eax + esi] // copy a pixel
6186 sub ecx, 1
6187 movd [edx], xmm0
6188 lea edx, [edx + 4]
6189 jge l1
6190 l1b:
6191 pop edi
6192 pop esi
6193 ret
6194 }
6195 }
6196 #endif // HAS_ARGBAFFINEROW_SSE2
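
// Scalar sketch of the affine copy above, assuming |uv_dudv| holds
// { u, v, du, dv } as floats: each destination pixel is fetched from the
// source at the truncated (u, v) coordinate, then the coordinate is stepped
// by (du, dv).  Illustrative only.
static void ARGBAffineRow_ReferenceSketch(const uint8* src_argb,
                                          int src_argb_stride,
                                          uint8* dst_argb,
                                          const float* uv_dudv, int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;                 // cvttps2dq truncates toward zero.
    int y = (int)v;
    const uint8* src = src_argb + y * src_argb_stride + x * 4;
    dst_argb[i * 4 + 0] = src[0];
    dst_argb[i * 4 + 1] = src[1];
    dst_argb[i * 4 + 2] = src[2];
    dst_argb[i * 4 + 3] = src[3];
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}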
6197
6198 #ifdef HAS_INTERPOLATEROW_AVX2
6199 // Bilinear filter 32x2 -> 32x1
6200 __declspec(naked) __declspec(align(16))
6201 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
6202 ptrdiff_t src_stride, int dst_width,
6203 int source_y_fraction) {
6204 __asm {
6205 push esi
6206 push edi
6207 mov edi, [esp + 8 + 4] // dst_ptr
6208 mov esi, [esp + 8 + 8] // src_ptr
6209 mov edx, [esp + 8 + 12] // src_stride
6210 mov ecx, [esp + 8 + 16] // dst_width
6211 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6212 shr eax, 1
6213 // Dispatch to specialized filters if applicable.
6214 cmp eax, 0
6215 je xloop100 // 0 / 128. Blend 100 / 0.
6216 sub edi, esi
6217 cmp eax, 32
6218 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
6219 cmp eax, 64
6220 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
6221 cmp eax, 96
6222 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
6223
6224 vmovd xmm0, eax // high fraction 0..127
6225 neg eax
6226 add eax, 128
6227 vmovd xmm5, eax // low fraction 128..1
6228 vpunpcklbw xmm5, xmm5, xmm0
6229 vpunpcklwd xmm5, xmm5, xmm5
6230 vpxor ymm0, ymm0, ymm0
6231 vpermd ymm5, ymm0, ymm5
6232
6233 align 4
6234 xloop:
6235 vmovdqu ymm0, [esi]
6236 vmovdqu ymm2, [esi + edx]
6237 vpunpckhbw ymm1, ymm0, ymm2 // mutates
6238 vpunpcklbw ymm0, ymm0, ymm2 // mutates
6239 vpmaddubsw ymm0, ymm0, ymm5
6240 vpmaddubsw ymm1, ymm1, ymm5
6241 vpsrlw ymm0, ymm0, 7
6242 vpsrlw ymm1, ymm1, 7
6243 vpackuswb ymm0, ymm0, ymm1 // unmutates
6244 sub ecx, 32
6245 vmovdqu [esi + edi], ymm0
6246 lea esi, [esi + 32]
6247 jg xloop
6248 jmp xloop99
6249
6250 // Blend 25 / 75.
6251 align 4
6252 xloop25:
6253 vmovdqu ymm0, [esi]
6254 vpavgb ymm0, ymm0, [esi + edx]
6255 vpavgb ymm0, ymm0, [esi + edx]
6256 sub ecx, 32
6257 vmovdqu [esi + edi], ymm0
6258 lea esi, [esi + 32]
6259 jg xloop25
6260 jmp xloop99
6261
6262 // Blend 50 / 50.
6263 align 4
6264 xloop50:
6265 vmovdqu ymm0, [esi]
6266 vpavgb ymm0, ymm0, [esi + edx]
6267 sub ecx, 32
6268 vmovdqu [esi + edi], ymm0
6269 lea esi, [esi + 32]
6270 jg xloop50
6271 jmp xloop99
6272
6273 // Blend 75 / 25.
6274 align 4
6275 xloop75:
6276 vmovdqu ymm0, [esi + edx]
6277 vpavgb ymm0, ymm0, [esi]
6278 vpavgb ymm0, ymm0, [esi]
6279 sub ecx, 32
6280 vmovdqu [esi + edi], ymm0
6281 lea esi, [esi + 32]
6282 jg xloop75
6283 jmp xloop99
6284
6285 // Blend 100 / 0 - Copy row unchanged.
6286 align 4
6287 xloop100:
6288 rep movsb
6289
6290 xloop99:
6291 pop edi
6292 pop esi
6293 vzeroupper
6294 ret
6295 }
6296 }
6297 #endif // HAS_INTERPOLATEROW_AVX2
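
// Scalar sketch of the two-row blend performed by the InterpolateRow_AVX2
// loop above (and the SSSE3/SSE2 variants that follow): the y fraction
// (0..255) is halved to 0..127 and each byte becomes
// (row0 * (128 - f) + row1 * f) >> 7, matching the pmaddubsw / psrlw pair in
// the general loop.  The 0/25/50/75 percent branches are faster pavgb paths
// for the same blend, with slightly different rounding.  Hypothetical name.
static void InterpolateRow_ReferenceSketch(uint8* dst_ptr, const uint8* src_ptr,
                                           ptrdiff_t src_stride, int dst_width,
                                           int source_y_fraction) {
  int f = source_y_fraction >> 1;         // 0..127
  const uint8* row0 = src_ptr;
  const uint8* row1 = src_ptr + src_stride;
  for (int i = 0; i < dst_width; ++i) {
    dst_ptr[i] = (uint8)((row0[i] * (128 - f) + row1[i] * f) >> 7);
  }
}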
6298
6299 #ifdef HAS_INTERPOLATEROW_SSSE3
6300 // Bilinear filter 16x2 -> 16x1
6301 __declspec(naked) __declspec(align(16))
6302 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
6303 ptrdiff_t src_stride, int dst_width,
6304 int source_y_fraction) {
6305 __asm {
6306 push esi
6307 push edi
6308 mov edi, [esp + 8 + 4] // dst_ptr
6309 mov esi, [esp + 8 + 8] // src_ptr
6310 mov edx, [esp + 8 + 12] // src_stride
6311 mov ecx, [esp + 8 + 16] // dst_width
6312 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6313 sub edi, esi
6314 shr eax, 1
6315 // Dispatch to specialized filters if applicable.
6316 cmp eax, 0
6317 je xloop100 // 0 / 128. Blend 100 / 0.
6318 cmp eax, 32
6319 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
6320 cmp eax, 64
6321 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
6322 cmp eax, 96
6323 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
6324
6325 movd xmm0, eax // high fraction 0..127
6326 neg eax
6327 add eax, 128
6328 movd xmm5, eax // low fraction 128..1
6329 punpcklbw xmm5, xmm0
6330 punpcklwd xmm5, xmm5
6331 pshufd xmm5, xmm5, 0
6332
6333 align 4
6334 xloop:
6335 movdqa xmm0, [esi]
6336 movdqa xmm2, [esi + edx]
6337 movdqa xmm1, xmm0
6338 punpcklbw xmm0, xmm2
6339 punpckhbw xmm1, xmm2
6340 pmaddubsw xmm0, xmm5
6341 pmaddubsw xmm1, xmm5
6342 psrlw xmm0, 7
6343 psrlw xmm1, 7
6344 packuswb xmm0, xmm1
6345 sub ecx, 16
6346 movdqa [esi + edi], xmm0
6347 lea esi, [esi + 16]
6348 jg xloop
6349 jmp xloop99
6350
6351 // Blend 25 / 75.
6352 align 4
6353 xloop25:
6354 movdqa xmm0, [esi]
6355 movdqa xmm1, [esi + edx]
6356 pavgb xmm0, xmm1
6357 pavgb xmm0, xmm1
6358 sub ecx, 16
6359 movdqa [esi + edi], xmm0
6360 lea esi, [esi + 16]
6361 jg xloop25
6362 jmp xloop99
6363
6364 // Blend 50 / 50.
6365 align 4
6366 xloop50:
6367 movdqa xmm0, [esi]
6368 movdqa xmm1, [esi + edx]
6369 pavgb xmm0, xmm1
6370 sub ecx, 16
6371 movdqa [esi + edi], xmm0
6372 lea esi, [esi + 16]
6373 jg xloop50
6374 jmp xloop99
6375
6376 // Blend 75 / 25.
6377 align 4
6378 xloop75:
6379 movdqa xmm1, [esi]
6380 movdqa xmm0, [esi + edx]
6381 pavgb xmm0, xmm1
6382 pavgb xmm0, xmm1
6383 sub ecx, 16
6384 movdqa [esi + edi], xmm0
6385 lea esi, [esi + 16]
6386 jg xloop75
6387 jmp xloop99
6388
6389 // Blend 100 / 0 - Copy row unchanged.
6390 align 4
6391 xloop100:
6392 movdqa xmm0, [esi]
6393 sub ecx, 16
6394 movdqa [esi + edi], xmm0
6395 lea esi, [esi + 16]
6396 jg xloop100
6397
6398 xloop99:
6399 pop edi
6400 pop esi
6401 ret
6402 }
6403 }
6404 #endif // HAS_INTERPOLATEROW_SSSE3
6405
6406 #ifdef HAS_INTERPOLATEROW_SSE2
6407 // Bilinear filter 16x2 -> 16x1
6408 __declspec(naked) __declspec(align(16))
6409 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
6410 ptrdiff_t src_stride, int dst_width,
6411 int source_y_fraction) {
6412 __asm {
6413 push esi
6414 push edi
6415 mov edi, [esp + 8 + 4] // dst_ptr
6416 mov esi, [esp + 8 + 8] // src_ptr
6417 mov edx, [esp + 8 + 12] // src_stride
6418 mov ecx, [esp + 8 + 16] // dst_width
6419 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6420 sub edi, esi
6421 // Dispatch to specialized filters if applicable.
6422 cmp eax, 0
6423 je xloop100 // 0 / 256. Blend 100 / 0.
6424 cmp eax, 64
6425 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
6426 cmp eax, 128
6427 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
6428 cmp eax, 192
6429 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
6430
6431 movd xmm5, eax // xmm5 = y fraction
6432 punpcklbw xmm5, xmm5
6433 psrlw xmm5, 1
6434 punpcklwd xmm5, xmm5
6435 punpckldq xmm5, xmm5
6436 punpcklqdq xmm5, xmm5
6437 pxor xmm4, xmm4
6438
6439 align 4
6440 xloop:
6441 movdqa xmm0, [esi] // row0
6442 movdqa xmm2, [esi + edx] // row1
6443 movdqa xmm1, xmm0
6444 movdqa xmm3, xmm2
6445 punpcklbw xmm2, xmm4
6446 punpckhbw xmm3, xmm4
6447 punpcklbw xmm0, xmm4
6448 punpckhbw xmm1, xmm4
6449 psubw xmm2, xmm0 // row1 - row0
6450 psubw xmm3, xmm1
6451 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
6452 paddw xmm3, xmm3
6453 pmulhw xmm2, xmm5 // scale diff
6454 pmulhw xmm3, xmm5
6455 paddw xmm0, xmm2 // sum rows
6456 paddw xmm1, xmm3
6457 packuswb xmm0, xmm1
6458 sub ecx, 16
6459 movdqa [esi + edi], xmm0
6460 lea esi, [esi + 16]
6461 jg xloop
6462 jmp xloop99
6463
6464 // Blend 25 / 75.
6465 align 4
6466 xloop25:
6467 movdqa xmm0, [esi]
6468 movdqa xmm1, [esi + edx]
6469 pavgb xmm0, xmm1
6470 pavgb xmm0, xmm1
6471 sub ecx, 16
6472 movdqa [esi + edi], xmm0
6473 lea esi, [esi + 16]
6474 jg xloop25
6475 jmp xloop99
6476
6477 // Blend 50 / 50.
6478 align 4
6479 xloop50:
6480 movdqa xmm0, [esi]
6481 movdqa xmm1, [esi + edx]
6482 pavgb xmm0, xmm1
6483 sub ecx, 16
6484 movdqa [esi + edi], xmm0
6485 lea esi, [esi + 16]
6486 jg xloop50
6487 jmp xloop99
6488
6489 // Blend 75 / 25.
6490 align 4
6491 xloop75:
6492 movdqa xmm1, [esi]
6493 movdqa xmm0, [esi + edx]
6494 pavgb xmm0, xmm1
6495 pavgb xmm0, xmm1
6496 sub ecx, 16
6497 movdqa [esi + edi], xmm0
6498 lea esi, [esi + 16]
6499 jg xloop75
6500 jmp xloop99
6501
6502 // Blend 100 / 0 - Copy row unchanged.
6503 align 4
6504 xloop100:
6505 movdqa xmm0, [esi]
6506 sub ecx, 16
6507 movdqa [esi + edi], xmm0
6508 lea esi, [esi + 16]
6509 jg xloop100
6510
6511 xloop99:
6512 pop edi
6513 pop esi
6514 ret
6515 }
6516 }
6517 #endif // HAS_INTERPOLATEROW_SSE2
6518
6519 // Bilinear filter 16x2 -> 16x1
6520 __declspec(naked) __declspec(align(16))
6521 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
6522 ptrdiff_t src_stride, int dst_width,
6523 int source_y_fraction) {
6524 __asm {
6525 push esi
6526 push edi
6527 mov edi, [esp + 8 + 4] // dst_ptr
6528 mov esi, [esp + 8 + 8] // src_ptr
6529 mov edx, [esp + 8 + 12] // src_stride
6530 mov ecx, [esp + 8 + 16] // dst_width
6531 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6532 sub edi, esi
6533 shr eax, 1
6534 // Dispatch to specialized filters if applicable.
6535 cmp eax, 0
6536 je xloop100 // 0 / 128. Blend 100 / 0.
6537 cmp eax, 32
6538 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
6539 cmp eax, 64
6540 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
6541 cmp eax, 96
6542 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
6543
6544 movd xmm0, eax // high fraction 0..127
6545 neg eax
6546 add eax, 128
6547 movd xmm5, eax // low fraction 128..1
6548 punpcklbw xmm5, xmm0
6549 punpcklwd xmm5, xmm5
6550 pshufd xmm5, xmm5, 0
6551
6552 align 4
6553 xloop:
6554 movdqu xmm0, [esi]
6555 movdqu xmm2, [esi + edx]
6556 movdqu xmm1, xmm0
6557 punpcklbw xmm0, xmm2
6558 punpckhbw xmm1, xmm2
6559 pmaddubsw xmm0, xmm5
6560 pmaddubsw xmm1, xmm5
6561 psrlw xmm0, 7
6562 psrlw xmm1, 7
6563 packuswb xmm0, xmm1
6564 sub ecx, 16
6565 movdqu [esi + edi], xmm0
6566 lea esi, [esi + 16]
6567 jg xloop
6568 jmp xloop99
6569
6570 // Blend 25 / 75.
6571 align 4
6572 xloop25:
6573 movdqu xmm0, [esi]
6574 movdqu xmm1, [esi + edx]
6575 pavgb xmm0, xmm1
6576 pavgb xmm0, xmm1
6577 sub ecx, 16
6578 movdqu [esi + edi], xmm0
6579 lea esi, [esi + 16]
6580 jg xloop25
6581 jmp xloop99
6582
6583 // Blend 50 / 50.
6584 align 4
6585 xloop50:
6586 movdqu xmm0, [esi]
6587 movdqu xmm1, [esi + edx]
6588 pavgb xmm0, xmm1
6589 sub ecx, 16
6590 movdqu [esi + edi], xmm0
6591 lea esi, [esi + 16]
6592 jg xloop50
6593 jmp xloop99
6594
6595 // Blend 75 / 25.
6596 align 4
6597 xloop75:
6598 movdqu xmm1, [esi]
6599 movdqu xmm0, [esi + edx]
6600 pavgb xmm0, xmm1
6601 pavgb xmm0, xmm1
6602 sub ecx, 16
6603 movdqu [esi + edi], xmm0
6604 lea esi, [esi + 16]
6605 jg xloop75
6606 jmp xloop99
6607
6608 // Blend 100 / 0 - Copy row unchanged.
6609 align 4
6610 xloop100:
6611 movdqu xmm0, [esi]
6612 sub ecx, 16
6613 movdqu [esi + edi], xmm0
6614 lea esi, [esi + 16]
6615 jg xloop100
6616
6617 xloop99:
6618 pop edi
6619 pop esi
6620 ret
6621 }
6622 }
6623
6624 #ifdef HAS_INTERPOLATEROW_SSE2
6625 // Bilinear filter 16x2 -> 16x1
6626 __declspec(naked) __declspec(align(16))
6627 void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
6628 ptrdiff_t src_stride, int dst_width,
6629 int source_y_fraction) {
6630 __asm {
6631 push esi
6632 push edi
6633 mov edi, [esp + 8 + 4] // dst_ptr
6634 mov esi, [esp + 8 + 8] // src_ptr
6635 mov edx, [esp + 8 + 12] // src_stride
6636 mov ecx, [esp + 8 + 16] // dst_width
6637 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
6638 sub edi, esi
6639 // Dispatch to specialized filters if applicable.
6640 cmp eax, 0
6641 je xloop100 // 0 / 256. Blend 100 / 0.
6642 cmp eax, 64
6643 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
6644 cmp eax, 128
6645 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
6646 cmp eax, 192
6647 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
6648
6649 movd xmm5, eax // xmm5 = y fraction
6650 punpcklbw xmm5, xmm5
6651 psrlw xmm5, 1
6652 punpcklwd xmm5, xmm5
6653 punpckldq xmm5, xmm5
6654 punpcklqdq xmm5, xmm5
6655 pxor xmm4, xmm4
6656
6657 align 4
6658 xloop:
6659 movdqu xmm0, [esi] // row0
6660 movdqu xmm2, [esi + edx] // row1
6661 movdqu xmm1, xmm0
6662 movdqu xmm3, xmm2
6663 punpcklbw xmm2, xmm4
6664 punpckhbw xmm3, xmm4
6665 punpcklbw xmm0, xmm4
6666 punpckhbw xmm1, xmm4
6667 psubw xmm2, xmm0 // row1 - row0
6668 psubw xmm3, xmm1
6669 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
6670 paddw xmm3, xmm3
6671 pmulhw xmm2, xmm5 // scale diff
6672 pmulhw xmm3, xmm5
6673 paddw xmm0, xmm2 // sum rows
6674 paddw xmm1, xmm3
6675 packuswb xmm0, xmm1
6676 sub ecx, 16
6677 movdqu [esi + edi], xmm0
6678 lea esi, [esi + 16]
6679 jg xloop
6680 jmp xloop99
6681
6682 // Blend 25 / 75.
6683 align 4
6684 xloop25:
6685 movdqu xmm0, [esi]
6686 movdqu xmm1, [esi + edx]
6687 pavgb xmm0, xmm1
6688 pavgb xmm0, xmm1
6689 sub ecx, 16
6690 movdqu [esi + edi], xmm0
6691 lea esi, [esi + 16]
6692 jg xloop25
6693 jmp xloop99
6694
6695 // Blend 50 / 50.
6696 align 4
6697 xloop50:
6698 movdqu xmm0, [esi]
6699 movdqu xmm1, [esi + edx]
6700 pavgb xmm0, xmm1
6701 sub ecx, 16
6702 movdqu [esi + edi], xmm0
6703 lea esi, [esi + 16]
6704 jg xloop50
6705 jmp xloop99
6706
6707 // Blend 75 / 25.
6708 align 4
6709 xloop75:
6710 movdqu xmm1, [esi]
6711 movdqu xmm0, [esi + edx]
6712 pavgb xmm0, xmm1
6713 pavgb xmm0, xmm1
6714 sub ecx, 16
6715 movdqu [esi + edi], xmm0
6716 lea esi, [esi + 16]
6717 jg xloop75
6718 jmp xloop99
6719
6720 // Blend 100 / 0 - Copy row unchanged.
6721 align 4
6722 xloop100:
6723 movdqu xmm0, [esi]
6724 sub ecx, 16
6725 movdqu [esi + edi], xmm0
6726 lea esi, [esi + 16]
6727 jg xloop100
6728
6729 xloop99:
6730 pop edi
6731 pop esi
6732 ret
6733 }
6734 }
6735 #endif // HAS_INTERPOLATEROW_SSE2
6736
6737 __declspec(naked) __declspec(align(16))
6738 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
6739 uint8* dst_uv, int pix) {
6740 __asm {
6741 push edi
6742 mov eax, [esp + 4 + 4] // src_uv
6743 mov edx, [esp + 4 + 8] // src_uv_stride
6744 mov edi, [esp + 4 + 12] // dst_uv
6745 mov ecx, [esp + 4 + 16] // pix
6746 sub edi, eax
6747
6748 align 4
6749 convertloop:
6750 movdqa xmm0, [eax]
6751 pavgb xmm0, [eax + edx]
6752 sub ecx, 16
6753 movdqa [eax + edi], xmm0
6754 lea eax, [eax + 16]
6755 jg convertloop
6756 pop edi
6757 ret
6758 }
6759 }
6760
6761 #ifdef HAS_HALFROW_AVX2
6762 __declspec(naked) __declspec(align(16))
6763 void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
6764 uint8* dst_uv, int pix) {
6765 __asm {
6766 push edi
6767 mov eax, [esp + 4 + 4] // src_uv
6768 mov edx, [esp + 4 + 8] // src_uv_stride
6769 mov edi, [esp + 4 + 12] // dst_uv
6770 mov ecx, [esp + 4 + 16] // pix
6771 sub edi, eax
6772
6773 align 4
6774 convertloop:
6775 vmovdqu ymm0, [eax]
6776 vpavgb ymm0, ymm0, [eax + edx]
6777 sub ecx, 32
6778 vmovdqu [eax + edi], ymm0
6779 lea eax, [eax + 32]
6780 jg convertloop
6781
6782 pop edi
6783 vzeroupper
6784 ret
6785 }
6786 }
6787 #endif // HAS_HALFROW_AVX2
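
// Scalar sketch of the HalfRow variants above: each output byte is the
// rounded average of the byte in this row and the byte one stride below,
// matching pavgb / vpavgb.  Illustrative name only.
static void HalfRow_ReferenceSketch(const uint8* src_uv, int src_uv_stride,
                                    uint8* dst_uv, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_uv[i] = (uint8)((src_uv[i] + src_uv[i + src_uv_stride] + 1) >> 1);
  }
}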
6788
6789 __declspec(naked) __declspec(align(16))
6790 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
6791 uint32 selector, int pix) {
6792 __asm {
6793 mov eax, [esp + 4] // src_argb
6794 mov edx, [esp + 8] // dst_bayer
6795 movd xmm5, [esp + 12] // selector
6796 mov ecx, [esp + 16] // pix
6797 pshufd xmm5, xmm5, 0
6798
6799 align 4
6800 wloop:
6801 movdqa xmm0, [eax]
6802 movdqa xmm1, [eax + 16]
6803 lea eax, [eax + 32]
6804 pshufb xmm0, xmm5
6805 pshufb xmm1, xmm5
6806 punpckldq xmm0, xmm1
6807 sub ecx, 8
6808 movq qword ptr [edx], xmm0
6809 lea edx, [edx + 8]
6810 jg wloop
6811 ret
6812 }
6813 }
6814
6815 // Specialized ARGB to Bayer that just isolates G channel.
6816 __declspec(naked) __declspec(align(16))
6817 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
6818 uint32 selector, int pix) {
6819 __asm {
6820 mov eax, [esp + 4] // src_argb
6821 mov edx, [esp + 8] // dst_bayer
6822 // selector
6823 mov ecx, [esp + 16] // pix
6824 pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
6825 psrld xmm5, 24
6826
6827 align 4
6828 wloop:
6829 movdqa xmm0, [eax]
6830 movdqa xmm1, [eax + 16]
6831 lea eax, [eax + 32]
6832 psrld xmm0, 8 // Move green to bottom.
6833 psrld xmm1, 8
6834 pand xmm0, xmm5
6835 pand xmm1, xmm5
6836 packssdw xmm0, xmm1
6837 packuswb xmm0, xmm1
6838 sub ecx, 8
6839 movq qword ptr [edx], xmm0
6840 lea edx, [edx + 8]
6841 jg wloop
6842 ret
6843 }
6844 }
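
// Scalar sketch of the G-channel extraction above: ARGB is stored
// B, G, R, A in memory, so the green byte sits at offset 1 of each pixel
// (the psrld 8 + mask sequence in the SSE2 loop).  Hypothetical name.
static void ARGBToBayerGGRow_ReferenceSketch(const uint8* src_argb,
                                             uint8* dst_bayer, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_bayer[i] = src_argb[i * 4 + 1];
  }
}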
6845
6846 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
6847 __declspec(naked) __declspec(align(16))
6848 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
6849 const uint8* shuffler, int pix) {
6850 __asm {
6851 mov eax, [esp + 4] // src_argb
6852 mov edx, [esp + 8] // dst_argb
6853 mov ecx, [esp + 12] // shuffler
6854 movdqa xmm5, [ecx]
6855 mov ecx, [esp + 16] // pix
6856
6857 align 4
6858 wloop:
6859 movdqa xmm0, [eax]
6860 movdqa xmm1, [eax + 16]
6861 lea eax, [eax + 32]
6862 pshufb xmm0, xmm5
6863 pshufb xmm1, xmm5
6864 sub ecx, 8
6865 movdqa [edx], xmm0
6866 movdqa [edx + 16], xmm1
6867 lea edx, [edx + 32]
6868 jg wloop
6869 ret
6870 }
6871 }
6872
6873 __declspec(naked) __declspec(align(16))
6874 void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
6875 const uint8* shuffler, int pix) {
6876 __asm {
6877 mov eax, [esp + 4] // src_argb
6878 mov edx, [esp + 8] // dst_argb
6879 mov ecx, [esp + 12] // shuffler
6880 movdqa xmm5, [ecx]
6881 mov ecx, [esp + 16] // pix
6882
6883 align 4
6884 wloop:
6885 movdqu xmm0, [eax]
6886 movdqu xmm1, [eax + 16]
6887 lea eax, [eax + 32]
6888 pshufb xmm0, xmm5
6889 pshufb xmm1, xmm5
6890 sub ecx, 8
6891 movdqu [edx], xmm0
6892 movdqu [edx + 16], xmm1
6893 lea edx, [edx + 32]
6894 jg wloop
6895 ret
6896 }
6897 }
6898
6899 #ifdef HAS_ARGBSHUFFLEROW_AVX2
6900 __declspec(naked) __declspec(align(16))
6901 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
6902 const uint8* shuffler, int pix) {
6903 __asm {
6904 mov eax, [esp + 4] // src_argb
6905 mov edx, [esp + 8] // dst_argb
6906 mov ecx, [esp + 12] // shuffler
6907 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
6908 mov ecx, [esp + 16] // pix
6909
6910 align 4
6911 wloop:
6912 vmovdqu ymm0, [eax]
6913 vmovdqu ymm1, [eax + 32]
6914 lea eax, [eax + 64]
6915 vpshufb ymm0, ymm0, ymm5
6916 vpshufb ymm1, ymm1, ymm5
6917 sub ecx, 16
6918 vmovdqu [edx], ymm0
6919 vmovdqu [edx + 32], ymm1
6920 lea edx, [edx + 64]
6921 jg wloop
6922
6923 vzeroupper
6924 ret
6925 }
6926 }
6927 #endif // HAS_ARGBSHUFFLEROW_AVX2
6928
6929 __declspec(naked) __declspec(align(16))
6930 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
6931 const uint8* shuffler, int pix) {
6932 __asm {
6933 push ebx
6934 push esi
6935 mov eax, [esp + 8 + 4] // src_argb
6936 mov edx, [esp + 8 + 8] // dst_argb
6937 mov esi, [esp + 8 + 12] // shuffler
6938 mov ecx, [esp + 8 + 16] // pix
6939 pxor xmm5, xmm5
6940
6941 mov ebx, [esi] // shuffler
6942 cmp ebx, 0x03000102
6943 je shuf_3012
6944 cmp ebx, 0x00010203
6945 je shuf_0123
6946 cmp ebx, 0x00030201
6947 je shuf_0321
6948 cmp ebx, 0x02010003
6949 je shuf_2103
6950
6951 // TODO(fbarchard): Use one source pointer and 3 offsets.
6952 shuf_any1:
6953 movzx ebx, byte ptr [esi]
6954 movzx ebx, byte ptr [eax + ebx]
6955 mov [edx], bl
6956 movzx ebx, byte ptr [esi + 1]
6957 movzx ebx, byte ptr [eax + ebx]
6958 mov [edx + 1], bl
6959 movzx ebx, byte ptr [esi + 2]
6960 movzx ebx, byte ptr [eax + ebx]
6961 mov [edx + 2], bl
6962 movzx ebx, byte ptr [esi + 3]
6963 movzx ebx, byte ptr [eax + ebx]
6964 mov [edx + 3], bl
6965 lea eax, [eax + 4]
6966 lea edx, [edx + 4]
6967 sub ecx, 1
6968 jg shuf_any1
6969 jmp shuf99
6970
6971 align 4
6972 shuf_0123:
6973 movdqu xmm0, [eax]
6974 lea eax, [eax + 16]
6975 movdqa xmm1, xmm0
6976 punpcklbw xmm0, xmm5
6977 punpckhbw xmm1, xmm5
6978 pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
6979 pshuflw xmm0, xmm0, 01Bh
6980 pshufhw xmm1, xmm1, 01Bh
6981 pshuflw xmm1, xmm1, 01Bh
6982 packuswb xmm0, xmm1
6983 sub ecx, 4
6984 movdqu [edx], xmm0
6985 lea edx, [edx + 16]
6986 jg shuf_0123
6987 jmp shuf99
6988
6989 align 4
6990 shuf_0321:
6991 movdqu xmm0, [eax]
6992 lea eax, [eax + 16]
6993 movdqa xmm1, xmm0
6994 punpcklbw xmm0, xmm5
6995 punpckhbw xmm1, xmm5
6996 pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
6997 pshuflw xmm0, xmm0, 039h
6998 pshufhw xmm1, xmm1, 039h
6999 pshuflw xmm1, xmm1, 039h
7000 packuswb xmm0, xmm1
7001 sub ecx, 4
7002 movdqu [edx], xmm0
7003 lea edx, [edx + 16]
7004 jg shuf_0321
7005 jmp shuf99
7006
7007 align 4
7008 shuf_2103:
7009 movdqu xmm0, [eax]
7010 lea eax, [eax + 16]
7011 movdqa xmm1, xmm0
7012 punpcklbw xmm0, xmm5
7013 punpckhbw xmm1, xmm5
7014 pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
7015 pshuflw xmm0, xmm0, 093h
7016 pshufhw xmm1, xmm1, 093h
7017 pshuflw xmm1, xmm1, 093h
7018 packuswb xmm0, xmm1
7019 sub ecx, 4
7020 movdqu [edx], xmm0
7021 lea edx, [edx + 16]
7022 jg shuf_2103
7023 jmp shuf99
7024
7025 align 4
7026 shuf_3012:
7027 movdqu xmm0, [eax]
7028 lea eax, [eax + 16]
7029 movdqa xmm1, xmm0
7030 punpcklbw xmm0, xmm5
7031 punpckhbw xmm1, xmm5
7032 pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
7033 pshuflw xmm0, xmm0, 0C6h
7034 pshufhw xmm1, xmm1, 0C6h
7035 pshuflw xmm1, xmm1, 0C6h
7036 packuswb xmm0, xmm1
7037 sub ecx, 4
7038 movdqu [edx], xmm0
7039 lea edx, [edx + 16]
7040 jg shuf_3012
7041
7042 shuf99:
7043 pop esi
7044 pop ebx
7045 ret
7046 }
7047 }
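
// Scalar sketch of the channel shuffle implemented by the ARGBShuffleRow
// variants above: the first four bytes of |shuffler| give, for each output
// channel, which input channel to copy (e.g. {3, 2, 1, 0} reverses the byte
// order, as in the 0123 path).  Hypothetical helper name.
static void ARGBShuffleRow_ReferenceSketch(const uint8* src_argb,
                                           uint8* dst_argb,
                                           const uint8* shuffler, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[i * 4 + 0] = src_argb[i * 4 + shuffler[0]];
    dst_argb[i * 4 + 1] = src_argb[i * 4 + shuffler[1]];
    dst_argb[i * 4 + 2] = src_argb[i * 4 + shuffler[2]];
    dst_argb[i * 4 + 3] = src_argb[i * 4 + shuffler[3]];
  }
}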
7048
7049 // YUY2 - Macro-pixel = 2 image pixels
7050 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
7051
7052 // UYVY - Macro-pixel = 2 image pixels
7053 // U0Y0V0Y1
7054
7055 __declspec(naked) __declspec(align(16))
7056 void I422ToYUY2Row_SSE2(const uint8* src_y,
7057 const uint8* src_u,
7058 const uint8* src_v,
7059 uint8* dst_frame, int width) {
7060 __asm {
7061 push esi
7062 push edi
7063 mov eax, [esp + 8 + 4] // src_y
7064 mov esi, [esp + 8 + 8] // src_u
7065 mov edx, [esp + 8 + 12] // src_v
7066 mov edi, [esp + 8 + 16] // dst_frame
7067 mov ecx, [esp + 8 + 20] // width
7068 sub edx, esi
7069
7070 align 4
7071 convertloop:
7072 movq xmm2, qword ptr [esi] // U
7073 movq xmm3, qword ptr [esi + edx] // V
7074 lea esi, [esi + 8]
7075 punpcklbw xmm2, xmm3 // UV
7076 movdqu xmm0, [eax] // Y
7077 lea eax, [eax + 16]
7078 movdqa xmm1, xmm0
7079 punpcklbw xmm0, xmm2 // YUYV
7080 punpckhbw xmm1, xmm2
7081 movdqu [edi], xmm0
7082 movdqu [edi + 16], xmm1
7083 lea edi, [edi + 32]
7084 sub ecx, 16
7085 jg convertloop
7086
7087 pop edi
7088 pop esi
7089 ret
7090 }
7091 }
7092
7093 __declspec(naked) __declspec(align(16))
7094 void I422ToUYVYRow_SSE2(const uint8* src_y,
7095 const uint8* src_u,
7096 const uint8* src_v,
7097 uint8* dst_frame, int width) {
7098 __asm {
7099 push esi
7100 push edi
7101 mov eax, [esp + 8 + 4] // src_y
7102 mov esi, [esp + 8 + 8] // src_u
7103 mov edx, [esp + 8 + 12] // src_v
7104 mov edi, [esp + 8 + 16] // dst_frame
7105 mov ecx, [esp + 8 + 20] // width
7106 sub edx, esi
7107
7108 align 4
7109 convertloop:
7110 movq xmm2, qword ptr [esi] // U
7111 movq xmm3, qword ptr [esi + edx] // V
7112 lea esi, [esi + 8]
7113 punpcklbw xmm2, xmm3 // UV
7114 movdqu xmm0, [eax] // Y
7115 movdqa xmm1, xmm2
7116 lea eax, [eax + 16]
7117 punpcklbw xmm1, xmm0 // UYVY
7118 punpckhbw xmm2, xmm0
7119 movdqu [edi], xmm1
7120 movdqu [edi + 16], xmm2
7121 lea edi, [edi + 32]
7122 sub ecx, 16
7123 jg convertloop
7124
7125 pop edi
7126 pop esi
7127 ret
7128 }
7129 }
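
// Scalar sketch of the packing done by the two routines above, following the
// macro-pixel layouts in the comments: YUY2 stores Y0 U0 Y1 V0 for every
// pair of luma samples (UYVY is the same with U0 Y0 V0 Y1).  Hypothetical
// helper name.
static void I422ToYUY2Row_ReferenceSketch(const uint8* src_y,
                                          const uint8* src_u,
                                          const uint8* src_v,
                                          uint8* dst_frame, int width) {
  for (int i = 0; i < width; i += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    src_y += 2;
    ++src_u;
    ++src_v;
    dst_frame += 4;
  }
}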
7130
7131 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
7132 __declspec(naked) __declspec(align(16))
7133 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
7134 uint8* dst_argb, const float* poly,
7135 int width) {
7136 __asm {
7137 push esi
7138 mov eax, [esp + 4 + 4] /* src_argb */
7139 mov edx, [esp + 4 + 8] /* dst_argb */
7140 mov esi, [esp + 4 + 12] /* poly */
7141 mov ecx, [esp + 4 + 16] /* width */
7142 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
7143
7144 // 2 pixel loop.
7145 align 4
7146 convertloop:
7147 // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
7148 // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
7149 movq xmm0, qword ptr [eax] // BGRABGRA
7150 lea eax, [eax + 8]
7151 punpcklbw xmm0, xmm3
7152 movdqa xmm4, xmm0
7153 punpcklwd xmm0, xmm3 // pixel 0
7154 punpckhwd xmm4, xmm3 // pixel 1
7155 cvtdq2ps xmm0, xmm0 // 4 floats
7156 cvtdq2ps xmm4, xmm4
7157 movdqa xmm1, xmm0 // X
7158 movdqa xmm5, xmm4
7159 mulps xmm0, [esi + 16] // C1 * X
7160 mulps xmm4, [esi + 16]
7161 addps xmm0, [esi] // result = C0 + C1 * X
7162 addps xmm4, [esi]
7163 movdqa xmm2, xmm1
7164 movdqa xmm6, xmm5
7165 mulps xmm2, xmm1 // X * X
7166 mulps xmm6, xmm5
7167 mulps xmm1, xmm2 // X * X * X
7168 mulps xmm5, xmm6
7169 mulps xmm2, [esi + 32] // C2 * X * X
7170 mulps xmm6, [esi + 32]
7171 mulps xmm1, [esi + 48] // C3 * X * X * X
7172 mulps xmm5, [esi + 48]
7173 addps xmm0, xmm2 // result += C2 * X * X
7174 addps xmm4, xmm6
7175 addps xmm0, xmm1 // result += C3 * X * X * X
7176 addps xmm4, xmm5
7177 cvttps2dq xmm0, xmm0
7178 cvttps2dq xmm4, xmm4
7179 packuswb xmm0, xmm4
7180 packuswb xmm0, xmm0
7181 sub ecx, 2
7182 movq qword ptr [edx], xmm0
7183 lea edx, [edx + 8]
7184 jg convertloop
7185 pop esi
7186 ret
7187 }
7188 }
7189 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
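
// Scalar sketch of the per-channel cubic polynomial above.  |poly| holds
// four groups of four floats: C0, C1, C2, C3, each group with one
// coefficient per channel in B, G, R, A order, and each output byte is
// clamp(C0 + C1*X + C2*X^2 + C3*X^3) where X is the input byte as a float.
// Hypothetical helper name.
static void ARGBPolynomialRow_ReferenceSketch(const uint8* src_argb,
                                              uint8* dst_argb,
                                              const float* poly, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      float x = (float)src_argb[i * 4 + c];
      float r = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                poly[c + 12] * x * x * x;
      if (r < 0.f) r = 0.f;
      if (r > 255.f) r = 255.f;
      dst_argb[i * 4 + c] = (uint8)r;
    }
  }
}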
7190
7191 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
7192 __declspec(naked) __declspec(align(16))
7193 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
7194 uint8* dst_argb, const float* poly,
7195 int width) {
7196 __asm {
7197 mov eax, [esp + 4] /* src_argb */
7198 mov edx, [esp + 8] /* dst_argb */
7199 mov ecx, [esp + 12] /* poly */
7200 vbroadcastf128 ymm4, [ecx] // C0
7201 vbroadcastf128 ymm5, [ecx + 16] // C1
7202 vbroadcastf128 ymm6, [ecx + 32] // C2
7203 vbroadcastf128 ymm7, [ecx + 48] // C3
7204 mov ecx, [esp + 16] /* width */
7205
7206 // 2 pixel loop.
7207 align 4
7208 convertloop:
7209 vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
7210 lea eax, [eax + 8]
7211 vcvtdq2ps ymm0, ymm0 // X 8 floats
7212 vmulps ymm2, ymm0, ymm0 // X * X
7213 vmulps ymm3, ymm0, ymm7 // C3 * X
7214 vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
7215 vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
7216 vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
7217 vcvttps2dq ymm0, ymm0
7218 vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
7219 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
7220 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
7221 sub ecx, 2
7222 vmovq qword ptr [edx], xmm0
7223 lea edx, [edx + 8]
7224 jg convertloop
7225 vzeroupper
7226 ret
7227 }
7228 }
7229 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
7230
7231 #ifdef HAS_ARGBCOLORTABLEROW_X86
7232 // Transform ARGB pixels with color table.
7233 __declspec(naked) __declspec(align(16))
7234 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
7235 int width) {
7236 __asm {
7237 push esi
7238 mov eax, [esp + 4 + 4] /* dst_argb */
7239 mov esi, [esp + 4 + 8] /* table_argb */
7240 mov ecx, [esp + 4 + 12] /* width */
7241
7242 // 1 pixel loop.
7243 align 4
7244 convertloop:
7245 movzx edx, byte ptr [eax]
7246 lea eax, [eax + 4]
7247 movzx edx, byte ptr [esi + edx * 4]
7248 mov byte ptr [eax - 4], dl
7249 movzx edx, byte ptr [eax - 4 + 1]
7250 movzx edx, byte ptr [esi + edx * 4 + 1]
7251 mov byte ptr [eax - 4 + 1], dl
7252 movzx edx, byte ptr [eax - 4 + 2]
7253 movzx edx, byte ptr [esi + edx * 4 + 2]
7254 mov byte ptr [eax - 4 + 2], dl
7255 movzx edx, byte ptr [eax - 4 + 3]
7256 movzx edx, byte ptr [esi + edx * 4 + 3]
7257 mov byte ptr [eax - 4 + 3], dl
7258 dec ecx
7259 jg convertloop
7260 pop esi
7261 ret
7262 }
7263 }
7264 #endif // HAS_ARGBCOLORTABLEROW_X86
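
// Scalar sketch of the in-place table lookup above: channel value v of
// channel c is replaced by table_argb[v * 4 + c].  The RGB variant that
// follows is identical except that the alpha byte is left untouched.
// Illustrative name only.
static void ARGBColorTableRow_ReferenceSketch(uint8* dst_argb,
                                              const uint8* table_argb,
                                              int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      uint8 v = dst_argb[i * 4 + c];
      dst_argb[i * 4 + c] = table_argb[v * 4 + c];
    }
  }
}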
7265
7266 #ifdef HAS_RGBCOLORTABLEROW_X86
7267 // Transform RGB pixels with color table.
7268 __declspec(naked) __declspec(align(16))
7269 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
7270 __asm {
7271 push esi
7272 mov eax, [esp + 4 + 4] /* dst_argb */
7273 mov esi, [esp + 4 + 8] /* table_argb */
7274 mov ecx, [esp + 4 + 12] /* width */
7275
7276 // 1 pixel loop.
7277 align 4
7278 convertloop:
7279 movzx edx, byte ptr [eax]
7280 lea eax, [eax + 4]
7281 movzx edx, byte ptr [esi + edx * 4]
7282 mov byte ptr [eax - 4], dl
7283 movzx edx, byte ptr [eax - 4 + 1]
7284 movzx edx, byte ptr [esi + edx * 4 + 1]
7285 mov byte ptr [eax - 4 + 1], dl
7286 movzx edx, byte ptr [eax - 4 + 2]
7287 movzx edx, byte ptr [esi + edx * 4 + 2]
7288 mov byte ptr [eax - 4 + 2], dl
7289 dec ecx
7290 jg convertloop
7291
7292 pop esi
7293 ret
7294 }
7295 }
7296 #endif // HAS_RGBCOLORTABLEROW_X86
7297
7298 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
7299 // Transform RGB pixels with luma table.
7300 __declspec(naked) __declspec(align(16))
7301 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
7302 int width,
7303 const uint8* luma, uint32 lumacoeff) {
7304 __asm {
7305 push esi
7306 push edi
7307 mov eax, [esp + 8 + 4] /* src_argb */
7308 mov edi, [esp + 8 + 8] /* dst_argb */
7309 mov ecx, [esp + 8 + 12] /* width */
7310 movd xmm2, dword ptr [esp + 8 + 16] // luma table
7311 movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
7312 pshufd xmm2, xmm2, 0
7313 pshufd xmm3, xmm3, 0
7314 pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
7315 psllw xmm4, 8
7316 pxor xmm5, xmm5
7317
7318 // 4 pixel loop.
7319 align 4
7320 convertloop:
7321 movdqu xmm0, qword ptr [eax] // generate luma ptr
7322 pmaddubsw xmm0, xmm3
7323 phaddw xmm0, xmm0
7324 pand xmm0, xmm4 // mask out low bits
7325 punpcklwd xmm0, xmm5
7326 paddd xmm0, xmm2 // add table base
7327 movd esi, xmm0
7328 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
7329
7330 movzx edx, byte ptr [eax]
7331 movzx edx, byte ptr [esi + edx]
7332 mov byte ptr [edi], dl
7333 movzx edx, byte ptr [eax + 1]
7334 movzx edx, byte ptr [esi + edx]
7335 mov byte ptr [edi + 1], dl
7336 movzx edx, byte ptr [eax + 2]
7337 movzx edx, byte ptr [esi + edx]
7338 mov byte ptr [edi + 2], dl
7339 movzx edx, byte ptr [eax + 3] // copy alpha.
7340 mov byte ptr [edi + 3], dl
7341
7342 movd esi, xmm0
7343 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
7344
7345 movzx edx, byte ptr [eax + 4]
7346 movzx edx, byte ptr [esi + edx]
7347 mov byte ptr [edi + 4], dl
7348 movzx edx, byte ptr [eax + 5]
7349 movzx edx, byte ptr [esi + edx]
7350 mov byte ptr [edi + 5], dl
7351 movzx edx, byte ptr [eax + 6]
7352 movzx edx, byte ptr [esi + edx]
7353 mov byte ptr [edi + 6], dl
7354 movzx edx, byte ptr [eax + 7] // copy alpha.
7355 mov byte ptr [edi + 7], dl
7356
7357 movd esi, xmm0
7358 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
7359
7360 movzx edx, byte ptr [eax + 8]
7361 movzx edx, byte ptr [esi + edx]
7362 mov byte ptr [edi + 8], dl
7363 movzx edx, byte ptr [eax + 9]
7364 movzx edx, byte ptr [esi + edx]
7365 mov byte ptr [edi + 9], dl
7366 movzx edx, byte ptr [eax + 10]
7367 movzx edx, byte ptr [esi + edx]
7368 mov byte ptr [edi + 10], dl
7369 movzx edx, byte ptr [eax + 11] // copy alpha.
7370 mov byte ptr [edi + 11], dl
7371
7372 movd esi, xmm0
7373
7374 movzx edx, byte ptr [eax + 12]
7375 movzx edx, byte ptr [esi + edx]
7376 mov byte ptr [edi + 12], dl
7377 movzx edx, byte ptr [eax + 13]
7378 movzx edx, byte ptr [esi + edx]
7379 mov byte ptr [edi + 13], dl
7380 movzx edx, byte ptr [eax + 14]
7381 movzx edx, byte ptr [esi + edx]
7382 mov byte ptr [edi + 14], dl
7383 movzx edx, byte ptr [eax + 15] // copy alpha.
7384 mov byte ptr [edi + 15], dl
7385
7386 sub ecx, 4
7387 lea eax, [eax + 16]
7388 lea edi, [edi + 16]
7389 jg convertloop
7390
7391 pop edi
7392 pop esi
7393 ret
7394 }
7395 }
7396 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
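
// Scalar sketch of the luma-dependent table lookup above, assuming
// |lumacoeff| packs four small byte weights (applied to B, G, R, A): the
// weighted sum is rounded down to a multiple of 256 and selects a 256-entry
// sub-table inside |luma|, which remaps B, G and R while alpha is copied
// through.  Hypothetical helper name; the exact weight encoding is an
// assumption.
static void ARGBLumaColorTableRow_ReferenceSketch(const uint8* src_argb,
                                                  uint8* dst_argb, int width,
                                                  const uint8* luma,
                                                  uint32 lumacoeff) {
  const uint8* w = (const uint8*)&lumacoeff;  // per-channel weights
  for (int i = 0; i < width; ++i) {
    const uint8* p = src_argb + i * 4;
    int sum = p[0] * w[0] + p[1] * w[1] + p[2] * w[2] + p[3] * w[3];
    const uint8* table = luma + (sum & 0xff00);  // 256-byte bucket
    dst_argb[i * 4 + 0] = table[p[0]];
    dst_argb[i * 4 + 1] = table[p[1]];
    dst_argb[i * 4 + 2] = table[p[2]];
    dst_argb[i * 4 + 3] = p[3];                  // copy alpha
  }
}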
7397
7398 #endif // defined(_M_X64)
7399 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
7400
7401 #ifdef __cplusplus
7402 } // extern "C"
7403 } // namespace libyuv
7404 #endif
7405