1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 // This module is for Visual C 32/64 bit and clang-cl 32 bit
14 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
15 (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
16
17 #if defined(_M_X64)
18 #include <emmintrin.h>
19 #include <tmmintrin.h> // For _mm_maddubs_epi16
20 #endif
21
22 #ifdef __cplusplus
23 namespace libyuv {
24 extern "C" {
25 #endif
26
27 // 64 bit
28 #if defined(_M_X64)
29
30 // Read 4 UV from 422, upsample to 8 UV.
31 #define READYUV422 \
32 xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
33 xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
34 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
35 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
36 u_buf += 4; \
37 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
38 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
39 y_buf += 8;
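// READYUV422 interleaves the 4 U and 4 V bytes to U0V0 U1V1 U2V2 U3V3 and
// duplicates each pair, giving one UV pair per pixel for 8 pixels.  Each of
// the 8 Y bytes is duplicated into both halves of its 16 bit lane, which the
// _mm_mulhi_epu16 step in YUVTORGB relies on.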
40
41 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
42 #define READYUVA422 \
43 xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \
44 xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
45 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
46 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \
47 u_buf += 4; \
48 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \
49 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \
50 y_buf += 8; \
51 xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \
52 a_buf += 8;
53
54 // Convert 8 pixels: 8 UV and 8 Y.
55 #define YUVTORGB(yuvconstants) \
56 xmm1 = _mm_loadu_si128(&xmm0); \
57 xmm2 = _mm_loadu_si128(&xmm0); \
58 xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
59 xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
60 xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
61 xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \
62 xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \
63 xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \
64 xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \
65 xmm0 = _mm_adds_epi16(xmm0, xmm4); \
66 xmm1 = _mm_adds_epi16(xmm1, xmm4); \
67 xmm2 = _mm_adds_epi16(xmm2, xmm4); \
68 xmm0 = _mm_srai_epi16(xmm0, 6); \
69 xmm1 = _mm_srai_epi16(xmm1, 6); \
70 xmm2 = _mm_srai_epi16(xmm2, 6); \
71 xmm0 = _mm_packus_epi16(xmm0, xmm0); \
72 xmm1 = _mm_packus_epi16(xmm1, xmm1); \
73 xmm2 = _mm_packus_epi16(xmm2, xmm2);
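// Per channel the macro above amounts to (shown for B; G and R use the
// kUVToG/kUVBiasG and kUVToR/kUVBiasR constants):
//   b16 = kUVBiasB - (u * kUVToB[0] + v * kUVToB[1])    // pmaddubs + sub
//   y16 = (y * 0x0101 * kYToRgb) >> 16                  // duplicated Y, mulhi
//   B   = unsigned_saturate((b16 + y16) >> 6)
// i.e. a 6 bit fixed point YUV to RGB conversion; the bias constants supplied
// in yuvconstants are expected to fold in the 128 UV offset and the Y offset.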
74
75 // Store 8 ARGB values.
76 #define STOREARGB \
77 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \
78 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \
79 xmm1 = _mm_loadu_si128(&xmm0); \
80 xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \
81 xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \
82 _mm_storeu_si128((__m128i*)dst_argb, xmm0); \
83 _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
84 dst_argb += 32;
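// STOREARGB interleaves the packed B (xmm0), G (xmm1), R (xmm2) and A (xmm5)
// bytes into B,G,R,A order and writes 8 ARGB pixels (32 bytes) to dst_argb.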
85
86 #if defined(HAS_I422TOARGBROW_SSSE3)
87 void I422ToARGBRow_SSSE3(const uint8* y_buf,
88 const uint8* u_buf,
89 const uint8* v_buf,
90 uint8* dst_argb,
91 const struct YuvConstants* yuvconstants,
92 int width) {
93 __m128i xmm0, xmm1, xmm2, xmm4;
94 const __m128i xmm5 = _mm_set1_epi8(-1);
95 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
96 while (width > 0) {
97 READYUV422
98 YUVTORGB(yuvconstants)
99 STOREARGB
100 width -= 8;
101 }
102 }
103 #endif
104
105 #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
106 void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
107 const uint8* u_buf,
108 const uint8* v_buf,
109 const uint8* a_buf,
110 uint8* dst_argb,
111 const struct YuvConstants* yuvconstants,
112 int width) {
113 __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
114 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
115 while (width > 0) {
116 READYUVA422
117 YUVTORGB(yuvconstants)
118 STOREARGB
119 width -= 8;
120 }
121 }
122 #endif
123
124 // 32 bit
125 #else // defined(_M_X64)
126 #ifdef HAS_ARGBTOYROW_SSSE3
127
128 // Constants for ARGB.
129 static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
130 13, 65, 33, 0, 13, 65, 33, 0};
131
132 // JPEG full range.
133 static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
134 15, 75, 38, 0, 15, 75, 38, 0};
135
136 static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
137 112, -74, -38, 0, 112, -74, -38, 0};
138
139 static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
140 127, -84, -43, 0, 127, -84, -43, 0};
141
142 static const vec8 kARGBToV = {
143 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
144 };
145
146 static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
147 -20, -107, 127, 0, -20, -107, 127, 0};
148
149 // vpshufb for vphaddw + vpackuswb packed to shorts.
150 static const lvec8 kShufARGBToUV_AVX = {
151 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
152 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
153
154 // Constants for BGRA.
155 static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
156 0, 33, 65, 13, 0, 33, 65, 13};
157
158 static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
159 0, -38, -74, 112, 0, -38, -74, 112};
160
161 static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
162 0, 112, -94, -18, 0, 112, -94, -18};
163
164 // Constants for ABGR.
165 static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
166 33, 65, 13, 0, 33, 65, 13, 0};
167
168 static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
169 -38, -74, 112, 0, -38, -74, 112, 0};
170
171 static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
172 112, -94, -18, 0, 112, -94, -18, 0};
173
174 // Constants for RGBA.
175 static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
176 0, 13, 65, 33, 0, 13, 65, 33};
177
178 static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
179 0, 112, -74, -38, 0, 112, -74, -38};
180
181 static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
182 0, -18, -94, 112, 0, -18, -94, 112};
183
184 static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
185 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
186
187 // 0.5 in 7 bit fixed point (64/128), added before the >> 7 to round.
188 static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
189
190 static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
191 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
192
193 static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
194 0x8080u, 0x8080u, 0x8080u, 0x8080u};
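// For reference, the per pixel scalar math implied by the constants above
// (BT.601 studio range; the J variants use the full range coefficients and
// round with kAddYJ64 instead of adding 16):
//   Y = ((33 * R + 65 * G + 13 * B) >> 7) + 16
//   U = ((112 * B - 74 * G - 38 * R) >> 8) + 128
//   V = ((112 * R - 94 * G - 18 * B) >> 8) + 128
// The UV row functions below apply these to 2x2 averaged pixels.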
195
196 // Shuffle table for converting RGB24 to ARGB.
197 static const uvec8 kShuffleMaskRGB24ToARGB = {
198 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
199
200 // Shuffle table for converting RAW to ARGB.
201 static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
202 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
203
204 // Shuffle table for converting RAW to RGB24. First 8.
205 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
206 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
207 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
208
209 // Shuffle table for converting RAW to RGB24. Middle 8.
210 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
211 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
212 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
213
214 // Shuffle table for converting RAW to RGB24. Last 8.
215 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
216 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
217 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
218
219 // Shuffle table for converting ARGB to RGB24.
220 static const uvec8 kShuffleMaskARGBToRGB24 = {
221 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
222
223 // Shuffle table for converting ARGB to RAW.
224 static const uvec8 kShuffleMaskARGBToRAW = {
225 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
226
227 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
228 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
229 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
230
231 // YUY2 shuf 16 Y to 32 Y.
232 static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
233 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
234 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
235
236 // YUY2 shuf 8 UV to 16 UV.
237 static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
238 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
239 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
240
241 // UYVY shuf 16 Y to 32 Y.
242 static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
243 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
244 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
245
246 // UYVY shuf 8 UV to 16 UV.
247 static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
248 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
249 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
250
251 // NV21 shuf 8 VU to 16 UV.
252 static const lvec8 kShuffleNV21 = {
253 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
254 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
255 };
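// kShuffleNV21 swaps each VU byte pair to UV order and repeats it, so the 8
// VU pairs read by READNV21_AVX2 expand to 16 UV pairs for 16 pixels.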
256
257 // Duplicates gray value 3 times and fills in alpha opaque.
258 __declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y,
259 uint8* dst_argb,
260 int width) {
261 __asm {
262 mov eax, [esp + 4] // src_y
263 mov edx, [esp + 8] // dst_argb
264 mov ecx, [esp + 12] // width
265 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
266 pslld xmm5, 24
267
268 convertloop:
269 movq xmm0, qword ptr [eax]
270 lea eax, [eax + 8]
271 punpcklbw xmm0, xmm0
272 movdqa xmm1, xmm0
273 punpcklwd xmm0, xmm0
274 punpckhwd xmm1, xmm1
275 por xmm0, xmm5
276 por xmm1, xmm5
277 movdqu [edx], xmm0
278 movdqu [edx + 16], xmm1
279 lea edx, [edx + 32]
280 sub ecx, 8
281 jg convertloop
282 ret
283 }
284 }
285
286 #ifdef HAS_J400TOARGBROW_AVX2
287 // Duplicates gray value 3 times and fills in alpha opaque.
288 __declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y,
289 uint8* dst_argb,
290 int width) {
291 __asm {
292 mov eax, [esp + 4] // src_y
293 mov edx, [esp + 8] // dst_argb
294 mov ecx, [esp + 12] // width
295 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
296 vpslld ymm5, ymm5, 24
297
298 convertloop:
299 vmovdqu xmm0, [eax]
300 lea eax, [eax + 16]
301 vpermq ymm0, ymm0, 0xd8
302 vpunpcklbw ymm0, ymm0, ymm0
303 vpermq ymm0, ymm0, 0xd8
304 vpunpckhwd ymm1, ymm0, ymm0
305 vpunpcklwd ymm0, ymm0, ymm0
306 vpor ymm0, ymm0, ymm5
307 vpor ymm1, ymm1, ymm5
308 vmovdqu [edx], ymm0
309 vmovdqu [edx + 32], ymm1
310 lea edx, [edx + 64]
311 sub ecx, 16
312 jg convertloop
313 vzeroupper
314 ret
315 }
316 }
317 #endif // HAS_J400TOARGBROW_AVX2
318
319 __declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24,
320 uint8* dst_argb,
321 int width) {
322 __asm {
323 mov eax, [esp + 4] // src_rgb24
324 mov edx, [esp + 8] // dst_argb
325 mov ecx, [esp + 12] // width
326 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
327 pslld xmm5, 24
328 movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
329
330 convertloop:
331 movdqu xmm0, [eax]
332 movdqu xmm1, [eax + 16]
333 movdqu xmm3, [eax + 32]
334 lea eax, [eax + 48]
335 movdqa xmm2, xmm3
336 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
337 pshufb xmm2, xmm4
338 por xmm2, xmm5
339 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
340 pshufb xmm0, xmm4
341 movdqu [edx + 32], xmm2
342 por xmm0, xmm5
343 pshufb xmm1, xmm4
344 movdqu [edx], xmm0
345 por xmm1, xmm5
346 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
347 pshufb xmm3, xmm4
348 movdqu [edx + 16], xmm1
349 por xmm3, xmm5
350 movdqu [edx + 48], xmm3
351 lea edx, [edx + 64]
352 sub ecx, 16
353 jg convertloop
354 ret
355 }
356 }
357
358 __declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw,
359 uint8* dst_argb,
360 int width) {
361 __asm {
362 mov eax, [esp + 4] // src_raw
363 mov edx, [esp + 8] // dst_argb
364 mov ecx, [esp + 12] // width
365 pcmpeqb xmm5, xmm5 // generate mask 0xff000000
366 pslld xmm5, 24
367 movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB
368
369 convertloop:
370 movdqu xmm0, [eax]
371 movdqu xmm1, [eax + 16]
372 movdqu xmm3, [eax + 32]
373 lea eax, [eax + 48]
374 movdqa xmm2, xmm3
375 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
376 pshufb xmm2, xmm4
377 por xmm2, xmm5
378 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
379 pshufb xmm0, xmm4
380 movdqu [edx + 32], xmm2
381 por xmm0, xmm5
382 pshufb xmm1, xmm4
383 movdqu [edx], xmm0
384 por xmm1, xmm5
385 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
386 pshufb xmm3, xmm4
387 movdqu [edx + 16], xmm1
388 por xmm3, xmm5
389 movdqu [edx + 48], xmm3
390 lea edx, [edx + 64]
391 sub ecx, 16
392 jg convertloop
393 ret
394 }
395 }
396
397 __declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw,
398 uint8* dst_rgb24,
399 int width) {
400 __asm {
401 mov eax, [esp + 4] // src_raw
402 mov edx, [esp + 8] // dst_rgb24
403 mov ecx, [esp + 12] // width
404 movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
405 movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
406 movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2
407
408 convertloop:
409 movdqu xmm0, [eax]
410 movdqu xmm1, [eax + 4]
411 movdqu xmm2, [eax + 8]
412 lea eax, [eax + 24]
413 pshufb xmm0, xmm3
414 pshufb xmm1, xmm4
415 pshufb xmm2, xmm5
416 movq qword ptr [edx], xmm0
417 movq qword ptr [edx + 8], xmm1
418 movq qword ptr [edx + 16], xmm2
419 lea edx, [edx + 24]
420 sub ecx, 8
421 jg convertloop
422 ret
423 }
424 }
425
426 // pmul method to replicate bits.
427 // Math to replicate bits:
428 // (v << 8) | (v << 3)
429 // v * 256 + v * 8
430 // v * (256 + 8)
431 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
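// e.g. with the 5 bit value v left aligned in a 16 bit lane (v << 11),
// pmulhuw by 0x0108 gives (v * 264) >> 5 = (v << 3) | (v >> 2), so 31
// expands to 255.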
432 // 20 instructions.
433 __declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565,
434 uint8* dst_argb,
435 int width) {
436 __asm {
437 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
438 movd xmm5, eax
439 pshufd xmm5, xmm5, 0
440 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
441 movd xmm6, eax
442 pshufd xmm6, xmm6, 0
443 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
444 psllw xmm3, 11
445 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
446 psllw xmm4, 10
447 psrlw xmm4, 5
448 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
449 psllw xmm7, 8
450
451 mov eax, [esp + 4] // src_rgb565
452 mov edx, [esp + 8] // dst_argb
453 mov ecx, [esp + 12] // width
454 sub edx, eax
455 sub edx, eax
456
457 convertloop:
458 movdqu xmm0, [eax] // fetch 8 pixels of bgr565
459 movdqa xmm1, xmm0
460 movdqa xmm2, xmm0
461 pand xmm1, xmm3 // R in upper 5 bits
462 psllw xmm2, 11 // B in upper 5 bits
463 pmulhuw xmm1, xmm5 // * (256 + 8)
464 pmulhuw xmm2, xmm5 // * (256 + 8)
465 psllw xmm1, 8
466 por xmm1, xmm2 // RB
467 pand xmm0, xmm4 // G in middle 6 bits
468 pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
469 por xmm0, xmm7 // AG
470 movdqa xmm2, xmm1
471 punpcklbw xmm1, xmm0
472 punpckhbw xmm2, xmm0
473 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
474 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
475 lea eax, [eax + 16]
476 sub ecx, 8
477 jg convertloop
478 ret
479 }
480 }
481
482 #ifdef HAS_RGB565TOARGBROW_AVX2
483 // pmul method to replicate bits.
484 // Math to replicate bits:
485 // (v << 8) | (v << 3)
486 // v * 256 + v * 8
487 // v * (256 + 8)
488 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
489 __declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565,
490 uint8* dst_argb,
491 int width) {
492 __asm {
493 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
494 vmovd xmm5, eax
495 vbroadcastss ymm5, xmm5
496 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
497 vmovd xmm6, eax
498 vbroadcastss ymm6, xmm6
499 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
500 vpsllw ymm3, ymm3, 11
501 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green
502 vpsllw ymm4, ymm4, 10
503 vpsrlw ymm4, ymm4, 5
504 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
505 vpsllw ymm7, ymm7, 8
506
507 mov eax, [esp + 4] // src_rgb565
508 mov edx, [esp + 8] // dst_argb
509 mov ecx, [esp + 12] // width
510 sub edx, eax
511 sub edx, eax
512
513 convertloop:
514 vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565
515 vpand ymm1, ymm0, ymm3 // R in upper 5 bits
516 vpsllw ymm2, ymm0, 11 // B in upper 5 bits
517 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
518 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
519 vpsllw ymm1, ymm1, 8
520 vpor ymm1, ymm1, ymm2 // RB
521 vpand ymm0, ymm0, ymm4 // G in middle 6 bits
522 vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4)
523 vpor ymm0, ymm0, ymm7 // AG
524 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
525 vpermq ymm1, ymm1, 0xd8
526 vpunpckhbw ymm2, ymm1, ymm0
527 vpunpcklbw ymm1, ymm1, ymm0
528 vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
529 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
530 lea eax, [eax + 32]
531 sub ecx, 16
532 jg convertloop
533 vzeroupper
534 ret
535 }
536 }
537 #endif // HAS_RGB565TOARGBROW_AVX2
538
539 #ifdef HAS_ARGB1555TOARGBROW_AVX2
540 __declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
541 uint8* dst_argb,
542 int width) {
543 __asm {
544 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
545 vmovd xmm5, eax
546 vbroadcastss ymm5, xmm5
547 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
548 vmovd xmm6, eax
549 vbroadcastss ymm6, xmm6
550 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
551 vpsllw ymm3, ymm3, 11
552 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green
553 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
554 vpsllw ymm7, ymm7, 8
555
556 mov eax, [esp + 4] // src_argb1555
557 mov edx, [esp + 8] // dst_argb
558 mov ecx, [esp + 12] // width
559 sub edx, eax
560 sub edx, eax
561
562 convertloop:
563 vmovdqu ymm0, [eax] // fetch 16 pixels of 1555
564 vpsllw ymm1, ymm0, 1 // R in upper 5 bits
565 vpsllw ymm2, ymm0, 11 // B in upper 5 bits
566 vpand ymm1, ymm1, ymm3
567 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8)
568 vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8)
569 vpsllw ymm1, ymm1, 8
570 vpor ymm1, ymm1, ymm2 // RB
571 vpsraw ymm2, ymm0, 8 // A
572 vpand ymm0, ymm0, ymm4 // G in middle 5 bits
573 vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8)
574 vpand ymm2, ymm2, ymm7
575 vpor ymm0, ymm0, ymm2 // AG
576 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
577 vpermq ymm1, ymm1, 0xd8
578 vpunpckhbw ymm2, ymm1, ymm0
579 vpunpcklbw ymm1, ymm1, ymm0
580 vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB
581 vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB
582 lea eax, [eax + 32]
583 sub ecx, 16
584 jg convertloop
585 vzeroupper
586 ret
587 }
588 }
589 #endif // HAS_ARGB1555TOARGBROW_AVX2
590
591 #ifdef HAS_ARGB4444TOARGBROW_AVX2
592 __declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
593 uint8* dst_argb,
594 int width) {
595 __asm {
596 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
597 vmovd xmm4, eax
598 vbroadcastss ymm4, xmm4
599 vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles
600 mov eax, [esp + 4] // src_argb4444
601 mov edx, [esp + 8] // dst_argb
602 mov ecx, [esp + 12] // width
603 sub edx, eax
604 sub edx, eax
605
606 convertloop:
607 vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444
608 vpand ymm2, ymm0, ymm5 // mask high nibbles
609 vpand ymm0, ymm0, ymm4 // mask low nibbles
610 vpsrlw ymm3, ymm2, 4
611 vpsllw ymm1, ymm0, 4
612 vpor ymm2, ymm2, ymm3
613 vpor ymm0, ymm0, ymm1
614 vpermq ymm0, ymm0, 0xd8 // mutate for unpack
615 vpermq ymm2, ymm2, 0xd8
616 vpunpckhbw ymm1, ymm0, ymm2
617 vpunpcklbw ymm0, ymm0, ymm2
618 vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB
619 vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB
620 lea eax, [eax + 32]
621 sub ecx, 16
622 jg convertloop
623 vzeroupper
624 ret
625 }
626 }
627 #endif // HAS_ARGB4444TOARGBROW_AVX2
628
629 // 24 instructions
630 __declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
631 uint8* dst_argb,
632 int width) {
633 __asm {
634 mov eax, 0x01080108 // generate multiplier to repeat 5 bits
635 movd xmm5, eax
636 pshufd xmm5, xmm5, 0
637 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
638 movd xmm6, eax
639 pshufd xmm6, xmm6, 0
640 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
641 psllw xmm3, 11
642 movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
643 psrlw xmm4, 6
644 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
645 psllw xmm7, 8
646
647 mov eax, [esp + 4] // src_argb1555
648 mov edx, [esp + 8] // dst_argb
649 mov ecx, [esp + 12] // width
650 sub edx, eax
651 sub edx, eax
652
653 convertloop:
654 movdqu xmm0, [eax] // fetch 8 pixels of 1555
655 movdqa xmm1, xmm0
656 movdqa xmm2, xmm0
657 psllw xmm1, 1 // R in upper 5 bits
658 psllw xmm2, 11 // B in upper 5 bits
659 pand xmm1, xmm3
660 pmulhuw xmm2, xmm5 // * (256 + 8)
661 pmulhuw xmm1, xmm5 // * (256 + 8)
662 psllw xmm1, 8
663 por xmm1, xmm2 // RB
664 movdqa xmm2, xmm0
665 pand xmm0, xmm4 // G in middle 5 bits
666 psraw xmm2, 8 // A
667 pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
668 pand xmm2, xmm7
669 por xmm0, xmm2 // AG
670 movdqa xmm2, xmm1
671 punpcklbw xmm1, xmm0
672 punpckhbw xmm2, xmm0
673 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
674 movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
675 lea eax, [eax + 16]
676 sub ecx, 8
677 jg convertloop
678 ret
679 }
680 }
681
682 // 18 instructions.
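// Each 4 bit component n expands to 8 bits as (n << 4) | n = n * 0x11,
// e.g. 0xf becomes 0xff.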
683 __declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
684 uint8* dst_argb,
685 int width) {
686 __asm {
687 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
688 movd xmm4, eax
689 pshufd xmm4, xmm4, 0
690 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
691 pslld xmm5, 4
692 mov eax, [esp + 4] // src_argb4444
693 mov edx, [esp + 8] // dst_argb
694 mov ecx, [esp + 12] // width
695 sub edx, eax
696 sub edx, eax
697
698 convertloop:
699 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
700 movdqa xmm2, xmm0
701 pand xmm0, xmm4 // mask low nibbles
702 pand xmm2, xmm5 // mask high nibbles
703 movdqa xmm1, xmm0
704 movdqa xmm3, xmm2
705 psllw xmm1, 4
706 psrlw xmm3, 4
707 por xmm0, xmm1
708 por xmm2, xmm3
709 movdqa xmm1, xmm0
710 punpcklbw xmm0, xmm2
711 punpckhbw xmm1, xmm2
712 movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
713 movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
714 lea eax, [eax + 16]
715 sub ecx, 8
716 jg convertloop
717 ret
718 }
719 }
720
721 __declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb,
722 uint8* dst_rgb,
723 int width) {
724 __asm {
725 mov eax, [esp + 4] // src_argb
726 mov edx, [esp + 8] // dst_rgb
727 mov ecx, [esp + 12] // width
728 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
729
730 convertloop:
731 movdqu xmm0, [eax] // fetch 16 pixels of argb
732 movdqu xmm1, [eax + 16]
733 movdqu xmm2, [eax + 32]
734 movdqu xmm3, [eax + 48]
735 lea eax, [eax + 64]
736 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
737 pshufb xmm1, xmm6
738 pshufb xmm2, xmm6
739 pshufb xmm3, xmm6
740 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
741 psrldq xmm1, 4 // 8 bytes from 1
742 pslldq xmm4, 12 // 4 bytes from 1 for 0
743 movdqa xmm5, xmm2 // 8 bytes from 2 for 1
744 por xmm0, xmm4 // 4 bytes from 1 for 0
745 pslldq xmm5, 8 // 8 bytes from 2 for 1
746 movdqu [edx], xmm0 // store 0
747 por xmm1, xmm5 // 8 bytes from 2 for 1
748 psrldq xmm2, 8 // 4 bytes from 2
749 pslldq xmm3, 4 // 12 bytes from 3 for 2
750 por xmm2, xmm3 // 12 bytes from 3 for 2
751 movdqu [edx + 16], xmm1 // store 1
752 movdqu [edx + 32], xmm2 // store 2
753 lea edx, [edx + 48]
754 sub ecx, 16
755 jg convertloop
756 ret
757 }
758 }
759
760 __declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb,
761 uint8* dst_rgb,
762 int width) {
763 __asm {
764 mov eax, [esp + 4] // src_argb
765 mov edx, [esp + 8] // dst_rgb
766 mov ecx, [esp + 12] // width
767 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW
768
769 convertloop:
770 movdqu xmm0, [eax] // fetch 16 pixels of argb
771 movdqu xmm1, [eax + 16]
772 movdqu xmm2, [eax + 32]
773 movdqu xmm3, [eax + 48]
774 lea eax, [eax + 64]
775 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
776 pshufb xmm1, xmm6
777 pshufb xmm2, xmm6
778 pshufb xmm3, xmm6
779 movdqa xmm4, xmm1 // 4 bytes from 1 for 0
780 psrldq xmm1, 4 // 8 bytes from 1
781 pslldq xmm4, 12 // 4 bytes from 1 for 0
782 movdqa xmm5, xmm2 // 8 bytes from 2 for 1
783 por xmm0, xmm4 // 4 bytes from 1 for 0
784 pslldq xmm5, 8 // 8 bytes from 2 for 1
785 movdqu [edx], xmm0 // store 0
786 por xmm1, xmm5 // 8 bytes from 2 for 1
787 psrldq xmm2, 8 // 4 bytes from 2
788 pslldq xmm3, 4 // 12 bytes from 3 for 2
789 por xmm2, xmm3 // 12 bytes from 3 for 2
790 movdqu [edx + 16], xmm1 // store 1
791 movdqu [edx + 32], xmm2 // store 2
792 lea edx, [edx + 48]
793 sub ecx, 16
794 jg convertloop
795 ret
796 }
797 }
798
799 __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb,
800 uint8* dst_rgb,
801 int width) {
802 __asm {
803 mov eax, [esp + 4] // src_argb
804 mov edx, [esp + 8] // dst_rgb
805 mov ecx, [esp + 12] // width
806 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
807 psrld xmm3, 27
808 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
809 psrld xmm4, 26
810 pslld xmm4, 5
811 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
812 pslld xmm5, 11
813
814 convertloop:
815 movdqu xmm0, [eax] // fetch 4 pixels of argb
816 movdqa xmm1, xmm0 // B
817 movdqa xmm2, xmm0 // G
818 pslld xmm0, 8 // R
819 psrld xmm1, 3 // B
820 psrld xmm2, 5 // G
821 psrad xmm0, 16 // R
822 pand xmm1, xmm3 // B
823 pand xmm2, xmm4 // G
824 pand xmm0, xmm5 // R
825 por xmm1, xmm2 // BG
826 por xmm0, xmm1 // BGR
827 packssdw xmm0, xmm0
828 lea eax, [eax + 16]
829 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
830 lea edx, [edx + 8]
831 sub ecx, 4
832 jg convertloop
833 ret
834 }
835 }
836
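// Same as ARGBToRGB565Row_SSE2 but adds dither4 (4 per-column dither bytes,
// replicated across the 4 channel bytes of each pixel) with unsigned
// saturation before truncating to 5/6/5 bits.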
837 __declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
838 uint8* dst_rgb,
839 const uint32 dither4,
840 int width) {
841 __asm {
842
843 mov eax, [esp + 4] // src_argb
844 mov edx, [esp + 8] // dst_rgb
845 movd xmm6, [esp + 12] // dither4
846 mov ecx, [esp + 16] // width
847 punpcklbw xmm6, xmm6 // make dither 16 bytes
848 movdqa xmm7, xmm6
849 punpcklwd xmm6, xmm6
850 punpckhwd xmm7, xmm7
851 pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
852 psrld xmm3, 27
853 pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
854 psrld xmm4, 26
855 pslld xmm4, 5
856 pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
857 pslld xmm5, 11
858
859 convertloop:
860 movdqu xmm0, [eax] // fetch 4 pixels of argb
861 paddusb xmm0, xmm6 // add dither
862 movdqa xmm1, xmm0 // B
863 movdqa xmm2, xmm0 // G
864 pslld xmm0, 8 // R
865 psrld xmm1, 3 // B
866 psrld xmm2, 5 // G
867 psrad xmm0, 16 // R
868 pand xmm1, xmm3 // B
869 pand xmm2, xmm4 // G
870 pand xmm0, xmm5 // R
871 por xmm1, xmm2 // BG
872 por xmm0, xmm1 // BGR
873 packssdw xmm0, xmm0
874 lea eax, [eax + 16]
875 movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
876 lea edx, [edx + 8]
877 sub ecx, 4
878 jg convertloop
879 ret
880 }
881 }
882
883 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
884 __declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
885 uint8* dst_rgb,
886 const uint32 dither4,
887 int width) {
888 __asm {
889 mov eax, [esp + 4] // src_argb
890 mov edx, [esp + 8] // dst_rgb
891 vbroadcastss xmm6, [esp + 12] // dither4
892 mov ecx, [esp + 16] // width
893 vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes
894 vpermq ymm6, ymm6, 0xd8
895 vpunpcklwd ymm6, ymm6, ymm6
896 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
897 vpsrld ymm3, ymm3, 27
898 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
899 vpsrld ymm4, ymm4, 26
900 vpslld ymm4, ymm4, 5
901 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
902
903 convertloop:
904 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
905 vpaddusb ymm0, ymm0, ymm6 // add dither
906 vpsrld ymm2, ymm0, 5 // G
907 vpsrld ymm1, ymm0, 3 // B
908 vpsrld ymm0, ymm0, 8 // R
909 vpand ymm2, ymm2, ymm4 // G
910 vpand ymm1, ymm1, ymm3 // B
911 vpand ymm0, ymm0, ymm5 // R
912 vpor ymm1, ymm1, ymm2 // BG
913 vpor ymm0, ymm0, ymm1 // BGR
914 vpackusdw ymm0, ymm0, ymm0
915 vpermq ymm0, ymm0, 0xd8
916 lea eax, [eax + 32]
917 vmovdqu [edx], xmm0 // store 8 pixels of RGB565
918 lea edx, [edx + 16]
919 sub ecx, 8
920 jg convertloop
921 vzeroupper
922 ret
923 }
924 }
925 #endif // HAS_ARGBTORGB565DITHERROW_AVX2
926
927 // TODO(fbarchard): Improve sign extension/packing.
928 __declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb,
929 uint8* dst_rgb,
930 int width) {
931 __asm {
932 mov eax, [esp + 4] // src_argb
933 mov edx, [esp + 8] // dst_rgb
934 mov ecx, [esp + 12] // width
935 pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
936 psrld xmm4, 27
937 movdqa xmm5, xmm4 // generate mask 0x000003e0
938 pslld xmm5, 5
939 movdqa xmm6, xmm4 // generate mask 0x00007c00
940 pslld xmm6, 10
941 pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
942 pslld xmm7, 15
943
944 convertloop:
945 movdqu xmm0, [eax] // fetch 4 pixels of argb
946 movdqa xmm1, xmm0 // B
947 movdqa xmm2, xmm0 // G
948 movdqa xmm3, xmm0 // R
949 psrad xmm0, 16 // A
950 psrld xmm1, 3 // B
951 psrld xmm2, 6 // G
952 psrld xmm3, 9 // R
953 pand xmm0, xmm7 // A
954 pand xmm1, xmm4 // B
955 pand xmm2, xmm5 // G
956 pand xmm3, xmm6 // R
957 por xmm0, xmm1 // BA
958 por xmm2, xmm3 // GR
959 por xmm0, xmm2 // BGRA
960 packssdw xmm0, xmm0
961 lea eax, [eax + 16]
962 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
963 lea edx, [edx + 8]
964 sub ecx, 4
965 jg convertloop
966 ret
967 }
968 }
969
970 __declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb,
971 uint8* dst_rgb,
972 int width) {
973 __asm {
974 mov eax, [esp + 4] // src_argb
975 mov edx, [esp + 8] // dst_rgb
976 mov ecx, [esp + 12] // width
977 pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
978 psllw xmm4, 12
979 movdqa xmm3, xmm4 // generate mask 0x00f000f0
980 psrlw xmm3, 8
981
982 convertloop:
983 movdqu xmm0, [eax] // fetch 4 pixels of argb
984 movdqa xmm1, xmm0
985 pand xmm0, xmm3 // low nibble
986 pand xmm1, xmm4 // high nibble
987 psrld xmm0, 4
988 psrld xmm1, 8
989 por xmm0, xmm1
990 packuswb xmm0, xmm0
991 lea eax, [eax + 16]
992 movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
993 lea edx, [edx + 8]
994 sub ecx, 4
995 jg convertloop
996 ret
997 }
998 }
999
1000 #ifdef HAS_ARGBTORGB565ROW_AVX2
1001 __declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb,
1002 uint8* dst_rgb,
1003 int width) {
1004 __asm {
1005 mov eax, [esp + 4] // src_argb
1006 mov edx, [esp + 8] // dst_rgb
1007 mov ecx, [esp + 12] // width
1008 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
1009 vpsrld ymm3, ymm3, 27
1010 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
1011 vpsrld ymm4, ymm4, 26
1012 vpslld ymm4, ymm4, 5
1013 vpslld ymm5, ymm3, 11 // generate mask 0x0000f800
1014
1015 convertloop:
1016 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1017 vpsrld ymm2, ymm0, 5 // G
1018 vpsrld ymm1, ymm0, 3 // B
1019 vpsrld ymm0, ymm0, 8 // R
1020 vpand ymm2, ymm2, ymm4 // G
1021 vpand ymm1, ymm1, ymm3 // B
1022 vpand ymm0, ymm0, ymm5 // R
1023 vpor ymm1, ymm1, ymm2 // BG
1024 vpor ymm0, ymm0, ymm1 // BGR
1025 vpackusdw ymm0, ymm0, ymm0
1026 vpermq ymm0, ymm0, 0xd8
1027 lea eax, [eax + 32]
1028 vmovdqu [edx], xmm0 // store 8 pixels of RGB565
1029 lea edx, [edx + 16]
1030 sub ecx, 8
1031 jg convertloop
1032 vzeroupper
1033 ret
1034 }
1035 }
1036 #endif // HAS_ARGBTORGB565ROW_AVX2
1037
1038 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
1039 __declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb,
1040 uint8* dst_rgb,
1041 int width) {
1042 __asm {
1043 mov eax, [esp + 4] // src_argb
1044 mov edx, [esp + 8] // dst_rgb
1045 mov ecx, [esp + 12] // width
1046 vpcmpeqb ymm4, ymm4, ymm4
1047 vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
1048 vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
1049 vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
1050 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
1051 vpslld ymm7, ymm7, 15
1052
1053 convertloop:
1054 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1055 vpsrld ymm3, ymm0, 9 // R
1056 vpsrld ymm2, ymm0, 6 // G
1057 vpsrld ymm1, ymm0, 3 // B
1058 vpsrad ymm0, ymm0, 16 // A
1059 vpand ymm3, ymm3, ymm6 // R
1060 vpand ymm2, ymm2, ymm5 // G
1061 vpand ymm1, ymm1, ymm4 // B
1062 vpand ymm0, ymm0, ymm7 // A
1063 vpor ymm0, ymm0, ymm1 // BA
1064 vpor ymm2, ymm2, ymm3 // GR
1065 vpor ymm0, ymm0, ymm2 // BGRA
1066 vpackssdw ymm0, ymm0, ymm0
1067 vpermq ymm0, ymm0, 0xd8
1068 lea eax, [eax + 32]
1069 vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
1070 lea edx, [edx + 16]
1071 sub ecx, 8
1072 jg convertloop
1073 vzeroupper
1074 ret
1075 }
1076 }
1077 #endif // HAS_ARGBTOARGB1555ROW_AVX2
1078
1079 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
1080 __declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb,
1081 uint8* dst_rgb,
1082 int width) {
1083 __asm {
1084 mov eax, [esp + 4] // src_argb
1085 mov edx, [esp + 8] // dst_rgb
1086 mov ecx, [esp + 12] // width
1087 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
1088 vpsllw ymm4, ymm4, 12
1089 vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
1090
1091 convertloop:
1092 vmovdqu ymm0, [eax] // fetch 8 pixels of argb
1093 vpand ymm1, ymm0, ymm4 // high nibble
1094 vpand ymm0, ymm0, ymm3 // low nibble
1095 vpsrld ymm1, ymm1, 8
1096 vpsrld ymm0, ymm0, 4
1097 vpor ymm0, ymm0, ymm1
1098 vpackuswb ymm0, ymm0, ymm0
1099 vpermq ymm0, ymm0, 0xd8
1100 lea eax, [eax + 32]
1101 vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
1102 lea edx, [edx + 16]
1103 sub ecx, 8
1104 jg convertloop
1105 vzeroupper
1106 ret
1107 }
1108 }
1109 #endif // HAS_ARGBTOARGB4444ROW_AVX2
1110
1111 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
1112 __declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb,
1113 uint8* dst_y,
1114 int width) {
1115 __asm {
1116 mov eax, [esp + 4] /* src_argb */
1117 mov edx, [esp + 8] /* dst_y */
1118 mov ecx, [esp + 12] /* width */
1119 movdqa xmm4, xmmword ptr kARGBToY
1120 movdqa xmm5, xmmword ptr kAddY16
1121
1122 convertloop:
1123 movdqu xmm0, [eax]
1124 movdqu xmm1, [eax + 16]
1125 movdqu xmm2, [eax + 32]
1126 movdqu xmm3, [eax + 48]
1127 pmaddubsw xmm0, xmm4
1128 pmaddubsw xmm1, xmm4
1129 pmaddubsw xmm2, xmm4
1130 pmaddubsw xmm3, xmm4
1131 lea eax, [eax + 64]
1132 phaddw xmm0, xmm1
1133 phaddw xmm2, xmm3
1134 psrlw xmm0, 7
1135 psrlw xmm2, 7
1136 packuswb xmm0, xmm2
1137 paddb xmm0, xmm5
1138 movdqu [edx], xmm0
1139 lea edx, [edx + 16]
1140 sub ecx, 16
1141 jg convertloop
1142 ret
1143 }
1144 }
1145
1146 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
1147 // Same as ARGBToYRow but with different coefficients, no add 16, and rounding applied.
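// Per pixel this is YJ = (15 * B + 75 * G + 38 * R + 64) >> 7, using the
// kARGBToYJ coefficients and the kAddYJ64 rounding constant defined above.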
1148 __declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb,
1149 uint8* dst_y,
1150 int width) {
1151 __asm {
1152 mov eax, [esp + 4] /* src_argb */
1153 mov edx, [esp + 8] /* dst_y */
1154 mov ecx, [esp + 12] /* width */
1155 movdqa xmm4, xmmword ptr kARGBToYJ
1156 movdqa xmm5, xmmword ptr kAddYJ64
1157
1158 convertloop:
1159 movdqu xmm0, [eax]
1160 movdqu xmm1, [eax + 16]
1161 movdqu xmm2, [eax + 32]
1162 movdqu xmm3, [eax + 48]
1163 pmaddubsw xmm0, xmm4
1164 pmaddubsw xmm1, xmm4
1165 pmaddubsw xmm2, xmm4
1166 pmaddubsw xmm3, xmm4
1167 lea eax, [eax + 64]
1168 phaddw xmm0, xmm1
1169 phaddw xmm2, xmm3
1170 paddw xmm0, xmm5 // Add .5 for rounding.
1171 paddw xmm2, xmm5
1172 psrlw xmm0, 7
1173 psrlw xmm2, 7
1174 packuswb xmm0, xmm2
1175 movdqu [edx], xmm0
1176 lea edx, [edx + 16]
1177 sub ecx, 16
1178 jg convertloop
1179 ret
1180 }
1181 }
1182
1183 #ifdef HAS_ARGBTOYROW_AVX2
1184 // vpermd for undoing the vphaddw + vpackuswb mutation.
1185 static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
1186
1187 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1188 __declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb,
1189 uint8* dst_y,
1190 int width) {
1191 __asm {
1192 mov eax, [esp + 4] /* src_argb */
1193 mov edx, [esp + 8] /* dst_y */
1194 mov ecx, [esp + 12] /* width */
1195 vbroadcastf128 ymm4, xmmword ptr kARGBToY
1196 vbroadcastf128 ymm5, xmmword ptr kAddY16
1197 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
1198
1199 convertloop:
1200 vmovdqu ymm0, [eax]
1201 vmovdqu ymm1, [eax + 32]
1202 vmovdqu ymm2, [eax + 64]
1203 vmovdqu ymm3, [eax + 96]
1204 vpmaddubsw ymm0, ymm0, ymm4
1205 vpmaddubsw ymm1, ymm1, ymm4
1206 vpmaddubsw ymm2, ymm2, ymm4
1207 vpmaddubsw ymm3, ymm3, ymm4
1208 lea eax, [eax + 128]
1209 vphaddw ymm0, ymm0, ymm1 // mutates.
1210 vphaddw ymm2, ymm2, ymm3
1211 vpsrlw ymm0, ymm0, 7
1212 vpsrlw ymm2, ymm2, 7
1213 vpackuswb ymm0, ymm0, ymm2 // mutates.
1214 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
1215 vpaddb ymm0, ymm0, ymm5 // add 16 for Y
1216 vmovdqu [edx], ymm0
1217 lea edx, [edx + 32]
1218 sub ecx, 32
1219 jg convertloop
1220 vzeroupper
1221 ret
1222 }
1223 }
1224 #endif // HAS_ARGBTOYROW_AVX2
1225
1226 #ifdef HAS_ARGBTOYJROW_AVX2
1227 // Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
1228 __declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb,
1229 uint8* dst_y,
1230 int width) {
1231 __asm {
1232 mov eax, [esp + 4] /* src_argb */
1233 mov edx, [esp + 8] /* dst_y */
1234 mov ecx, [esp + 12] /* width */
1235 vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
1236 vbroadcastf128 ymm5, xmmword ptr kAddYJ64
1237 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX
1238
1239 convertloop:
1240 vmovdqu ymm0, [eax]
1241 vmovdqu ymm1, [eax + 32]
1242 vmovdqu ymm2, [eax + 64]
1243 vmovdqu ymm3, [eax + 96]
1244 vpmaddubsw ymm0, ymm0, ymm4
1245 vpmaddubsw ymm1, ymm1, ymm4
1246 vpmaddubsw ymm2, ymm2, ymm4
1247 vpmaddubsw ymm3, ymm3, ymm4
1248 lea eax, [eax + 128]
1249 vphaddw ymm0, ymm0, ymm1 // mutates.
1250 vphaddw ymm2, ymm2, ymm3
1251 vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
1252 vpaddw ymm2, ymm2, ymm5
1253 vpsrlw ymm0, ymm0, 7
1254 vpsrlw ymm2, ymm2, 7
1255 vpackuswb ymm0, ymm0, ymm2 // mutates.
1256 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
1257 vmovdqu [edx], ymm0
1258 lea edx, [edx + 32]
1259 sub ecx, 32
1260 jg convertloop
1261
1262 vzeroupper
1263 ret
1264 }
1265 }
1266 #endif // HAS_ARGBTOYJROW_AVX2
1267
1268 __declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb,
1269 uint8* dst_y,
1270 int width) {
1271 __asm {
1272 mov eax, [esp + 4] /* src_argb */
1273 mov edx, [esp + 8] /* dst_y */
1274 mov ecx, [esp + 12] /* width */
1275 movdqa xmm4, xmmword ptr kBGRAToY
1276 movdqa xmm5, xmmword ptr kAddY16
1277
1278 convertloop:
1279 movdqu xmm0, [eax]
1280 movdqu xmm1, [eax + 16]
1281 movdqu xmm2, [eax + 32]
1282 movdqu xmm3, [eax + 48]
1283 pmaddubsw xmm0, xmm4
1284 pmaddubsw xmm1, xmm4
1285 pmaddubsw xmm2, xmm4
1286 pmaddubsw xmm3, xmm4
1287 lea eax, [eax + 64]
1288 phaddw xmm0, xmm1
1289 phaddw xmm2, xmm3
1290 psrlw xmm0, 7
1291 psrlw xmm2, 7
1292 packuswb xmm0, xmm2
1293 paddb xmm0, xmm5
1294 movdqu [edx], xmm0
1295 lea edx, [edx + 16]
1296 sub ecx, 16
1297 jg convertloop
1298 ret
1299 }
1300 }
1301
1302 __declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb,
1303 uint8* dst_y,
1304 int width) {
1305 __asm {
1306 mov eax, [esp + 4] /* src_argb */
1307 mov edx, [esp + 8] /* dst_y */
1308 mov ecx, [esp + 12] /* width */
1309 movdqa xmm4, xmmword ptr kABGRToY
1310 movdqa xmm5, xmmword ptr kAddY16
1311
1312 convertloop:
1313 movdqu xmm0, [eax]
1314 movdqu xmm1, [eax + 16]
1315 movdqu xmm2, [eax + 32]
1316 movdqu xmm3, [eax + 48]
1317 pmaddubsw xmm0, xmm4
1318 pmaddubsw xmm1, xmm4
1319 pmaddubsw xmm2, xmm4
1320 pmaddubsw xmm3, xmm4
1321 lea eax, [eax + 64]
1322 phaddw xmm0, xmm1
1323 phaddw xmm2, xmm3
1324 psrlw xmm0, 7
1325 psrlw xmm2, 7
1326 packuswb xmm0, xmm2
1327 paddb xmm0, xmm5
1328 movdqu [edx], xmm0
1329 lea edx, [edx + 16]
1330 sub ecx, 16
1331 jg convertloop
1332 ret
1333 }
1334 }
1335
1336 __declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb,
1337 uint8* dst_y,
1338 int width) {
1339 __asm {
1340 mov eax, [esp + 4] /* src_argb */
1341 mov edx, [esp + 8] /* dst_y */
1342 mov ecx, [esp + 12] /* width */
1343 movdqa xmm4, xmmword ptr kRGBAToY
1344 movdqa xmm5, xmmword ptr kAddY16
1345
1346 convertloop:
1347 movdqu xmm0, [eax]
1348 movdqu xmm1, [eax + 16]
1349 movdqu xmm2, [eax + 32]
1350 movdqu xmm3, [eax + 48]
1351 pmaddubsw xmm0, xmm4
1352 pmaddubsw xmm1, xmm4
1353 pmaddubsw xmm2, xmm4
1354 pmaddubsw xmm3, xmm4
1355 lea eax, [eax + 64]
1356 phaddw xmm0, xmm1
1357 phaddw xmm2, xmm3
1358 psrlw xmm0, 7
1359 psrlw xmm2, 7
1360 packuswb xmm0, xmm2
1361 paddb xmm0, xmm5
1362 movdqu [edx], xmm0
1363 lea edx, [edx + 16]
1364 sub ecx, 16
1365 jg convertloop
1366 ret
1367 }
1368 }
1369
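// Converts 16 ARGB pixels from two rows to 8 U and 8 V values.  Pixels are
// first averaged 2x2 (pavgb vertically, shufps + pavgb horizontally) and the
// kARGBToU / kARGBToV coefficients are then applied as outlined after the
// constant tables above.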
1370 __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
1371 int src_stride_argb,
1372 uint8* dst_u,
1373 uint8* dst_v,
1374 int width) {
1375 __asm {
1376 push esi
1377 push edi
1378 mov eax, [esp + 8 + 4] // src_argb
1379 mov esi, [esp + 8 + 8] // src_stride_argb
1380 mov edx, [esp + 8 + 12] // dst_u
1381 mov edi, [esp + 8 + 16] // dst_v
1382 mov ecx, [esp + 8 + 20] // width
1383 movdqa xmm5, xmmword ptr kAddUV128
1384 movdqa xmm6, xmmword ptr kARGBToV
1385 movdqa xmm7, xmmword ptr kARGBToU
1386 sub edi, edx // stride from u to v
1387
1388 convertloop:
1389 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1390 movdqu xmm0, [eax]
1391 movdqu xmm4, [eax + esi]
1392 pavgb xmm0, xmm4
1393 movdqu xmm1, [eax + 16]
1394 movdqu xmm4, [eax + esi + 16]
1395 pavgb xmm1, xmm4
1396 movdqu xmm2, [eax + 32]
1397 movdqu xmm4, [eax + esi + 32]
1398 pavgb xmm2, xmm4
1399 movdqu xmm3, [eax + 48]
1400 movdqu xmm4, [eax + esi + 48]
1401 pavgb xmm3, xmm4
1402
1403 lea eax, [eax + 64]
1404 movdqa xmm4, xmm0
1405 shufps xmm0, xmm1, 0x88
1406 shufps xmm4, xmm1, 0xdd
1407 pavgb xmm0, xmm4
1408 movdqa xmm4, xmm2
1409 shufps xmm2, xmm3, 0x88
1410 shufps xmm4, xmm3, 0xdd
1411 pavgb xmm2, xmm4
1412
1413 // step 2 - convert to U and V
1414 // from here down is very similar to Y code except
1415 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1416 movdqa xmm1, xmm0
1417 movdqa xmm3, xmm2
1418 pmaddubsw xmm0, xmm7 // U
1419 pmaddubsw xmm2, xmm7
1420 pmaddubsw xmm1, xmm6 // V
1421 pmaddubsw xmm3, xmm6
1422 phaddw xmm0, xmm2
1423 phaddw xmm1, xmm3
1424 psraw xmm0, 8
1425 psraw xmm1, 8
1426 packsswb xmm0, xmm1
1427 paddb xmm0, xmm5 // -> unsigned
1428
1429 // step 3 - store 8 U and 8 V values
1430 movlps qword ptr [edx], xmm0 // U
1431 movhps qword ptr [edx + edi], xmm0 // V
1432 lea edx, [edx + 8]
1433 sub ecx, 16
1434 jg convertloop
1435
1436 pop edi
1437 pop esi
1438 ret
1439 }
1440 }
1441
1442 __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
1443 int src_stride_argb,
1444 uint8* dst_u,
1445 uint8* dst_v,
1446 int width) {
1447 __asm {
1448 push esi
1449 push edi
1450 mov eax, [esp + 8 + 4] // src_argb
1451 mov esi, [esp + 8 + 8] // src_stride_argb
1452 mov edx, [esp + 8 + 12] // dst_u
1453 mov edi, [esp + 8 + 16] // dst_v
1454 mov ecx, [esp + 8 + 20] // width
1455 movdqa xmm5, xmmword ptr kAddUVJ128
1456 movdqa xmm6, xmmword ptr kARGBToVJ
1457 movdqa xmm7, xmmword ptr kARGBToUJ
1458 sub edi, edx // stride from u to v
1459
1460 convertloop:
1461 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1462 movdqu xmm0, [eax]
1463 movdqu xmm4, [eax + esi]
1464 pavgb xmm0, xmm4
1465 movdqu xmm1, [eax + 16]
1466 movdqu xmm4, [eax + esi + 16]
1467 pavgb xmm1, xmm4
1468 movdqu xmm2, [eax + 32]
1469 movdqu xmm4, [eax + esi + 32]
1470 pavgb xmm2, xmm4
1471 movdqu xmm3, [eax + 48]
1472 movdqu xmm4, [eax + esi + 48]
1473 pavgb xmm3, xmm4
1474
1475 lea eax, [eax + 64]
1476 movdqa xmm4, xmm0
1477 shufps xmm0, xmm1, 0x88
1478 shufps xmm4, xmm1, 0xdd
1479 pavgb xmm0, xmm4
1480 movdqa xmm4, xmm2
1481 shufps xmm2, xmm3, 0x88
1482 shufps xmm4, xmm3, 0xdd
1483 pavgb xmm2, xmm4
1484
1485 // step 2 - convert to U and V
1486 // from here down is very similar to Y code except
1487 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1488 movdqa xmm1, xmm0
1489 movdqa xmm3, xmm2
1490 pmaddubsw xmm0, xmm7 // U
1491 pmaddubsw xmm2, xmm7
1492 pmaddubsw xmm1, xmm6 // V
1493 pmaddubsw xmm3, xmm6
1494 phaddw xmm0, xmm2
1495 phaddw xmm1, xmm3
1496 paddw xmm0, xmm5 // +.5 rounding -> unsigned
1497 paddw xmm1, xmm5
1498 psraw xmm0, 8
1499 psraw xmm1, 8
1500 packsswb xmm0, xmm1
1501
1502 // step 3 - store 8 U and 8 V values
1503 movlps qword ptr [edx], xmm0 // U
1504 movhps qword ptr [edx + edi], xmm0 // V
1505 lea edx, [edx + 8]
1506 sub ecx, 16
1507 jg convertloop
1508
1509 pop edi
1510 pop esi
1511 ret
1512 }
1513 }
1514
1515 #ifdef HAS_ARGBTOUVROW_AVX2
1516 __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
1517 int src_stride_argb,
1518 uint8* dst_u,
1519 uint8* dst_v,
1520 int width) {
1521 __asm {
1522 push esi
1523 push edi
1524 mov eax, [esp + 8 + 4] // src_argb
1525 mov esi, [esp + 8 + 8] // src_stride_argb
1526 mov edx, [esp + 8 + 12] // dst_u
1527 mov edi, [esp + 8 + 16] // dst_v
1528 mov ecx, [esp + 8 + 20] // width
1529 vbroadcastf128 ymm5, xmmword ptr kAddUV128
1530 vbroadcastf128 ymm6, xmmword ptr kARGBToV
1531 vbroadcastf128 ymm7, xmmword ptr kARGBToU
1532 sub edi, edx // stride from u to v
1533
1534 convertloop:
1535 /* step 1 - subsample 32x2 argb pixels to 16x1 */
1536 vmovdqu ymm0, [eax]
1537 vmovdqu ymm1, [eax + 32]
1538 vmovdqu ymm2, [eax + 64]
1539 vmovdqu ymm3, [eax + 96]
1540 vpavgb ymm0, ymm0, [eax + esi]
1541 vpavgb ymm1, ymm1, [eax + esi + 32]
1542 vpavgb ymm2, ymm2, [eax + esi + 64]
1543 vpavgb ymm3, ymm3, [eax + esi + 96]
1544 lea eax, [eax + 128]
1545 vshufps ymm4, ymm0, ymm1, 0x88
1546 vshufps ymm0, ymm0, ymm1, 0xdd
1547 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
1548 vshufps ymm4, ymm2, ymm3, 0x88
1549 vshufps ymm2, ymm2, ymm3, 0xdd
1550 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
1551
1552 // step 2 - convert to U and V
1553 // from here down is very similar to Y code except
1554 // instead of 32 different pixels, it's 16 pixels of U and 16 of V
1555 vpmaddubsw ymm1, ymm0, ymm7 // U
1556 vpmaddubsw ymm3, ymm2, ymm7
1557 vpmaddubsw ymm0, ymm0, ymm6 // V
1558 vpmaddubsw ymm2, ymm2, ymm6
1559 vphaddw ymm1, ymm1, ymm3 // mutates
1560 vphaddw ymm0, ymm0, ymm2
1561 vpsraw ymm1, ymm1, 8
1562 vpsraw ymm0, ymm0, 8
1563 vpacksswb ymm0, ymm1, ymm0 // mutates
1564 vpermq ymm0, ymm0, 0xd8 // For vpacksswb
1565 vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
1566 vpaddb ymm0, ymm0, ymm5 // -> unsigned
1567
1568 // step 3 - store 16 U and 16 V values
1569 vextractf128 [edx], ymm0, 0 // U
1570 vextractf128 [edx + edi], ymm0, 1 // V
1571 lea edx, [edx + 16]
1572 sub ecx, 32
1573 jg convertloop
1574
1575 pop edi
1576 pop esi
1577 vzeroupper
1578 ret
1579 }
1580 }
1581 #endif // HAS_ARGBTOUVROW_AVX2
1582
1583 #ifdef HAS_ARGBTOUVJROW_AVX2
1584 __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
1585 int src_stride_argb,
1586 uint8* dst_u,
1587 uint8* dst_v,
1588 int width) {
1589 __asm {
1590 push esi
1591 push edi
1592 mov eax, [esp + 8 + 4] // src_argb
1593 mov esi, [esp + 8 + 8] // src_stride_argb
1594 mov edx, [esp + 8 + 12] // dst_u
1595 mov edi, [esp + 8 + 16] // dst_v
1596 mov ecx, [esp + 8 + 20] // width
1597 vbroadcastf128 ymm5, xmmword ptr kAddUVJ128
1598 vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
1599 vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
1600 sub edi, edx // stride from u to v
1601
1602 convertloop:
1603 /* step 1 - subsample 32x2 argb pixels to 16x1 */
1604 vmovdqu ymm0, [eax]
1605 vmovdqu ymm1, [eax + 32]
1606 vmovdqu ymm2, [eax + 64]
1607 vmovdqu ymm3, [eax + 96]
1608 vpavgb ymm0, ymm0, [eax + esi]
1609 vpavgb ymm1, ymm1, [eax + esi + 32]
1610 vpavgb ymm2, ymm2, [eax + esi + 64]
1611 vpavgb ymm3, ymm3, [eax + esi + 96]
1612 lea eax, [eax + 128]
1613 vshufps ymm4, ymm0, ymm1, 0x88
1614 vshufps ymm0, ymm0, ymm1, 0xdd
1615 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
1616 vshufps ymm4, ymm2, ymm3, 0x88
1617 vshufps ymm2, ymm2, ymm3, 0xdd
1618 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
1619
1620 // step 2 - convert to U and V
1621 // from here down is very similar to Y code except
1622 // instead of 32 different pixels, it's 16 pixels of U and 16 of V
1623 vpmaddubsw ymm1, ymm0, ymm7 // U
1624 vpmaddubsw ymm3, ymm2, ymm7
1625 vpmaddubsw ymm0, ymm0, ymm6 // V
1626 vpmaddubsw ymm2, ymm2, ymm6
1627 vphaddw ymm1, ymm1, ymm3 // mutates
1628 vphaddw ymm0, ymm0, ymm2
1629 vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
1630 vpaddw ymm0, ymm0, ymm5
1631 vpsraw ymm1, ymm1, 8
1632 vpsraw ymm0, ymm0, 8
1633 vpacksswb ymm0, ymm1, ymm0 // mutates
1634 vpermq ymm0, ymm0, 0xd8 // For vpacksswb
1635 vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
1636
1637 // step 3 - store 16 U and 16 V values
1638 vextractf128 [edx], ymm0, 0 // U
1639 vextractf128 [edx + edi], ymm0, 1 // V
1640 lea edx, [edx + 16]
1641 sub ecx, 32
1642 jg convertloop
1643
1644 pop edi
1645 pop esi
1646 vzeroupper
1647 ret
1648 }
1649 }
1650 #endif // HAS_ARGBTOUVJROW_AVX2
1651
1652 __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
1653 uint8* dst_u,
1654 uint8* dst_v,
1655 int width) {
1656 __asm {
1657 push edi
1658 mov eax, [esp + 4 + 4] // src_argb
1659 mov edx, [esp + 4 + 8] // dst_u
1660 mov edi, [esp + 4 + 12] // dst_v
1661 mov ecx, [esp + 4 + 16] // width
1662 movdqa xmm5, xmmword ptr kAddUV128
1663 movdqa xmm6, xmmword ptr kARGBToV
1664 movdqa xmm7, xmmword ptr kARGBToU
1665 sub edi, edx // stride from u to v
1666
1667 convertloop:
1668 /* convert to U and V */
1669 movdqu xmm0, [eax] // U
1670 movdqu xmm1, [eax + 16]
1671 movdqu xmm2, [eax + 32]
1672 movdqu xmm3, [eax + 48]
1673 pmaddubsw xmm0, xmm7
1674 pmaddubsw xmm1, xmm7
1675 pmaddubsw xmm2, xmm7
1676 pmaddubsw xmm3, xmm7
1677 phaddw xmm0, xmm1
1678 phaddw xmm2, xmm3
1679 psraw xmm0, 8
1680 psraw xmm2, 8
1681 packsswb xmm0, xmm2
1682 paddb xmm0, xmm5
1683 movdqu [edx], xmm0
1684
1685 movdqu xmm0, [eax] // V
1686 movdqu xmm1, [eax + 16]
1687 movdqu xmm2, [eax + 32]
1688 movdqu xmm3, [eax + 48]
1689 pmaddubsw xmm0, xmm6
1690 pmaddubsw xmm1, xmm6
1691 pmaddubsw xmm2, xmm6
1692 pmaddubsw xmm3, xmm6
1693 phaddw xmm0, xmm1
1694 phaddw xmm2, xmm3
1695 psraw xmm0, 8
1696 psraw xmm2, 8
1697 packsswb xmm0, xmm2
1698 paddb xmm0, xmm5
1699 lea eax, [eax + 64]
1700 movdqu [edx + edi], xmm0
1701 lea edx, [edx + 16]
1702 sub ecx, 16
1703 jg convertloop
1704
1705 pop edi
1706 ret
1707 }
1708 }
1709
1710 __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
1711 int src_stride_argb,
1712 uint8* dst_u,
1713 uint8* dst_v,
1714 int width) {
1715 __asm {
1716 push esi
1717 push edi
1718 mov eax, [esp + 8 + 4] // src_argb
1719 mov esi, [esp + 8 + 8] // src_stride_argb
1720 mov edx, [esp + 8 + 12] // dst_u
1721 mov edi, [esp + 8 + 16] // dst_v
1722 mov ecx, [esp + 8 + 20] // width
1723 movdqa xmm5, xmmword ptr kAddUV128
1724 movdqa xmm6, xmmword ptr kBGRAToV
1725 movdqa xmm7, xmmword ptr kBGRAToU
1726 sub edi, edx // stride from u to v
1727
1728 convertloop:
1729 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1730 movdqu xmm0, [eax]
1731 movdqu xmm4, [eax + esi]
1732 pavgb xmm0, xmm4
1733 movdqu xmm1, [eax + 16]
1734 movdqu xmm4, [eax + esi + 16]
1735 pavgb xmm1, xmm4
1736 movdqu xmm2, [eax + 32]
1737 movdqu xmm4, [eax + esi + 32]
1738 pavgb xmm2, xmm4
1739 movdqu xmm3, [eax + 48]
1740 movdqu xmm4, [eax + esi + 48]
1741 pavgb xmm3, xmm4
1742
1743 lea eax, [eax + 64]
1744 movdqa xmm4, xmm0
1745 shufps xmm0, xmm1, 0x88
1746 shufps xmm4, xmm1, 0xdd
1747 pavgb xmm0, xmm4
1748 movdqa xmm4, xmm2
1749 shufps xmm2, xmm3, 0x88
1750 shufps xmm4, xmm3, 0xdd
1751 pavgb xmm2, xmm4
1752
1753 // step 2 - convert to U and V
1754 // from here down is very similar to Y code except
1755 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1756 movdqa xmm1, xmm0
1757 movdqa xmm3, xmm2
1758 pmaddubsw xmm0, xmm7 // U
1759 pmaddubsw xmm2, xmm7
1760 pmaddubsw xmm1, xmm6 // V
1761 pmaddubsw xmm3, xmm6
1762 phaddw xmm0, xmm2
1763 phaddw xmm1, xmm3
1764 psraw xmm0, 8
1765 psraw xmm1, 8
1766 packsswb xmm0, xmm1
1767 paddb xmm0, xmm5 // -> unsigned
1768
1769 // step 3 - store 8 U and 8 V values
1770 movlps qword ptr [edx], xmm0 // U
1771 movhps qword ptr [edx + edi], xmm0 // V
1772 lea edx, [edx + 8]
1773 sub ecx, 16
1774 jg convertloop
1775
1776 pop edi
1777 pop esi
1778 ret
1779 }
1780 }
1781
1782 __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
1783 int src_stride_argb,
1784 uint8* dst_u,
1785 uint8* dst_v,
1786 int width) {
1787 __asm {
1788 push esi
1789 push edi
1790 mov eax, [esp + 8 + 4] // src_argb
1791 mov esi, [esp + 8 + 8] // src_stride_argb
1792 mov edx, [esp + 8 + 12] // dst_u
1793 mov edi, [esp + 8 + 16] // dst_v
1794 mov ecx, [esp + 8 + 20] // width
1795 movdqa xmm5, xmmword ptr kAddUV128
1796 movdqa xmm6, xmmword ptr kABGRToV
1797 movdqa xmm7, xmmword ptr kABGRToU
1798 sub edi, edx // stride from u to v
1799
1800 convertloop:
1801 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1802 movdqu xmm0, [eax]
1803 movdqu xmm4, [eax + esi]
1804 pavgb xmm0, xmm4
1805 movdqu xmm1, [eax + 16]
1806 movdqu xmm4, [eax + esi + 16]
1807 pavgb xmm1, xmm4
1808 movdqu xmm2, [eax + 32]
1809 movdqu xmm4, [eax + esi + 32]
1810 pavgb xmm2, xmm4
1811 movdqu xmm3, [eax + 48]
1812 movdqu xmm4, [eax + esi + 48]
1813 pavgb xmm3, xmm4
1814
1815 lea eax, [eax + 64]
1816 movdqa xmm4, xmm0
1817 shufps xmm0, xmm1, 0x88
1818 shufps xmm4, xmm1, 0xdd
1819 pavgb xmm0, xmm4
1820 movdqa xmm4, xmm2
1821 shufps xmm2, xmm3, 0x88
1822 shufps xmm4, xmm3, 0xdd
1823 pavgb xmm2, xmm4
1824
1825 // step 2 - convert to U and V
1826 // from here down is very similar to Y code except
1827 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1828 movdqa xmm1, xmm0
1829 movdqa xmm3, xmm2
1830 pmaddubsw xmm0, xmm7 // U
1831 pmaddubsw xmm2, xmm7
1832 pmaddubsw xmm1, xmm6 // V
1833 pmaddubsw xmm3, xmm6
1834 phaddw xmm0, xmm2
1835 phaddw xmm1, xmm3
1836 psraw xmm0, 8
1837 psraw xmm1, 8
1838 packsswb xmm0, xmm1
1839 paddb xmm0, xmm5 // -> unsigned
1840
1841 // step 3 - store 8 U and 8 V values
1842 movlps qword ptr [edx], xmm0 // U
1843 movhps qword ptr [edx + edi], xmm0 // V
1844 lea edx, [edx + 8]
1845 sub ecx, 16
1846 jg convertloop
1847
1848 pop edi
1849 pop esi
1850 ret
1851 }
1852 }
1853
1854 __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
1855 int src_stride_argb,
1856 uint8* dst_u,
1857 uint8* dst_v,
1858 int width) {
1859 __asm {
1860 push esi
1861 push edi
1862 mov eax, [esp + 8 + 4] // src_argb
1863 mov esi, [esp + 8 + 8] // src_stride_argb
1864 mov edx, [esp + 8 + 12] // dst_u
1865 mov edi, [esp + 8 + 16] // dst_v
1866 mov ecx, [esp + 8 + 20] // width
1867 movdqa xmm5, xmmword ptr kAddUV128
1868 movdqa xmm6, xmmword ptr kRGBAToV
1869 movdqa xmm7, xmmword ptr kRGBAToU
1870 sub edi, edx // stride from u to v
1871
1872 convertloop:
1873 /* step 1 - subsample 16x2 argb pixels to 8x1 */
1874 movdqu xmm0, [eax]
1875 movdqu xmm4, [eax + esi]
1876 pavgb xmm0, xmm4
1877 movdqu xmm1, [eax + 16]
1878 movdqu xmm4, [eax + esi + 16]
1879 pavgb xmm1, xmm4
1880 movdqu xmm2, [eax + 32]
1881 movdqu xmm4, [eax + esi + 32]
1882 pavgb xmm2, xmm4
1883 movdqu xmm3, [eax + 48]
1884 movdqu xmm4, [eax + esi + 48]
1885 pavgb xmm3, xmm4
1886
1887 lea eax, [eax + 64]
1888 movdqa xmm4, xmm0
1889 shufps xmm0, xmm1, 0x88
1890 shufps xmm4, xmm1, 0xdd
1891 pavgb xmm0, xmm4
1892 movdqa xmm4, xmm2
1893 shufps xmm2, xmm3, 0x88
1894 shufps xmm4, xmm3, 0xdd
1895 pavgb xmm2, xmm4
1896
1897 // step 2 - convert to U and V
1898 // from here down is very similar to Y code except
1899 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
1900 movdqa xmm1, xmm0
1901 movdqa xmm3, xmm2
1902 pmaddubsw xmm0, xmm7 // U
1903 pmaddubsw xmm2, xmm7
1904 pmaddubsw xmm1, xmm6 // V
1905 pmaddubsw xmm3, xmm6
1906 phaddw xmm0, xmm2
1907 phaddw xmm1, xmm3
1908 psraw xmm0, 8
1909 psraw xmm1, 8
1910 packsswb xmm0, xmm1
1911 paddb xmm0, xmm5 // -> unsigned
1912
1913 // step 3 - store 8 U and 8 V values
1914 movlps qword ptr [edx], xmm0 // U
1915 movhps qword ptr [edx + edi], xmm0 // V
1916 lea edx, [edx + 8]
1917 sub ecx, 16
1918 jg convertloop
1919
1920 pop edi
1921 pop esi
1922 ret
1923 }
1924 }
1925 #endif // HAS_ARGBTOYROW_SSSE3
1926
1927 // Read 16 UV from 444
1928 #define READYUV444_AVX2 \
1929 __asm { \
1930 __asm vmovdqu xmm0, [esi] /* U */ \
1931 __asm vmovdqu xmm1, [esi + edi] /* V */ \
1932 __asm lea esi, [esi + 16] \
1933 __asm vpermq ymm0, ymm0, 0xd8 \
1934 __asm vpermq ymm1, ymm1, 0xd8 \
1935 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1936 __asm vmovdqu xmm4, [eax] /* Y */ \
1937 __asm vpermq ymm4, ymm4, 0xd8 \
1938 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1939 __asm lea eax, [eax + 16]}
1940
1941 // Read 8 UV from 422, upsample to 16 UV.
1942 #define READYUV422_AVX2 \
1943 __asm { \
1944 __asm vmovq xmm0, qword ptr [esi] /* U */ \
1945 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
1946 __asm lea esi, [esi + 8] \
1947 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1948 __asm vpermq ymm0, ymm0, 0xd8 \
1949 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1950 __asm vmovdqu xmm4, [eax] /* Y */ \
1951 __asm vpermq ymm4, ymm4, 0xd8 \
1952 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1953 __asm lea eax, [eax + 16]}
1954
1955 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
1956 #define READYUVA422_AVX2 \
1957 __asm { \
1958 __asm vmovq xmm0, qword ptr [esi] /* U */ \
1959 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
1960 __asm lea esi, [esi + 8] \
1961 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1962 __asm vpermq ymm0, ymm0, 0xd8 \
1963 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1964 __asm vmovdqu xmm4, [eax] /* Y */ \
1965 __asm vpermq ymm4, ymm4, 0xd8 \
1966 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1967 __asm lea eax, [eax + 16] \
1968 __asm vmovdqu xmm5, [ebp] /* A */ \
1969 __asm vpermq ymm5, ymm5, 0xd8 \
1970 __asm lea ebp, [ebp + 16]}
1971
1972 // Read 8 UV from NV12, upsample to 16 UV.
1973 #define READNV12_AVX2 \
1974 __asm { \
1975 __asm vmovdqu xmm0, [esi] /* UV */ \
1976 __asm lea esi, [esi + 16] \
1977 __asm vpermq ymm0, ymm0, 0xd8 \
1978 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1979 __asm vmovdqu xmm4, [eax] /* Y */ \
1980 __asm vpermq ymm4, ymm4, 0xd8 \
1981 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1982 __asm lea eax, [eax + 16]}
1983
1984 // Read 8 UV from NV21, upsample to 16 UV.
1985 #define READNV21_AVX2 \
1986 __asm { \
1987 __asm vmovdqu xmm0, [esi] /* UV */ \
1988 __asm lea esi, [esi + 16] \
1989 __asm vpermq ymm0, ymm0, 0xd8 \
1990 __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \
1991 __asm vmovdqu xmm4, [eax] /* Y */ \
1992 __asm vpermq ymm4, ymm4, 0xd8 \
1993 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1994 __asm lea eax, [eax + 16]}
1995
1996 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
1997 #define READYUY2_AVX2 \
1998 __asm { \
1999 __asm vmovdqu ymm4, [eax] /* YUY2 */ \
2000 __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \
2001 __asm vmovdqu ymm0, [eax] /* UV */ \
2002 __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \
2003 __asm lea eax, [eax + 32]}
2004
2005 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2006 #define READUYVY_AVX2 \
2007 __asm { \
2008 __asm vmovdqu ymm4, [eax] /* UYVY */ \
2009 __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \
2010 __asm vmovdqu ymm0, [eax] /* UV */ \
2011 __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \
2012 __asm lea eax, [eax + 32]}
2013
2014 // Convert 16 pixels: 16 UV and 16 Y.
2015 #define YUVTORGB_AVX2(YuvConstants) \
2016 __asm { \
2017 __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
2018 __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
2019 __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
2020 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
2021 __asm vpsubw ymm2, ymm3, ymm2 \
2022 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
2023 __asm vpsubw ymm1, ymm3, ymm1 \
2024 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
2025 __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \
2026 __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
2027 __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
2028 __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
2029 __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
2030 __asm vpsraw ymm0, ymm0, 6 \
2031 __asm vpsraw ymm1, ymm1, 6 \
2032 __asm vpsraw ymm2, ymm2, 6 \
2033 __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
2034 __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
2035 __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
2036 }
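// In scalar terms the macro computes, roughly, per pixel:
//   B = Clamp((KUVBIASB - (U * UB + V * VB) + ((Y * 257 * YG) >> 16)) >> 6)
// where UB/VB are the signed byte pairs of KUVTOB and YG is KYTORGB; G and R
// use the KUVTOG/KUVBIASG and KUVTOR/KUVBIASR constants the same way.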
2037
2038 // Store 16 ARGB values.
2039 #define STOREARGB_AVX2 \
2040 __asm { \
2041 __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
2042 __asm vpermq ymm0, ymm0, 0xd8 \
2043 __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
2044 __asm vpermq ymm2, ymm2, 0xd8 \
2045 __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
2046 __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
2047 __asm vmovdqu 0[edx], ymm1 \
2048 __asm vmovdqu 32[edx], ymm0 \
2049 __asm lea edx, [edx + 64]}
2050
2051 // Store 16 RGBA values.
2052 #define STORERGBA_AVX2 \
2053 __asm { \
2054 __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \
2055 __asm vpermq ymm1, ymm1, 0xd8 \
2056 __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \
2057 __asm vpermq ymm2, ymm2, 0xd8 \
2058 __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \
2059 __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \
2060 __asm vmovdqu [edx], ymm0 \
2061 __asm vmovdqu [edx + 32], ymm1 \
2062 __asm lea edx, [edx + 64]}
2063
2064 #ifdef HAS_I422TOARGBROW_AVX2
2065 // 16 pixels
2066 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2067 __declspec(naked) void I422ToARGBRow_AVX2(
2068 const uint8* y_buf,
2069 const uint8* u_buf,
2070 const uint8* v_buf,
2071 uint8* dst_argb,
2072 const struct YuvConstants* yuvconstants,
2073 int width) {
2074 __asm {
2075 push esi
2076 push edi
2077 push ebx
2078 mov eax, [esp + 12 + 4] // Y
2079 mov esi, [esp + 12 + 8] // U
2080 mov edi, [esp + 12 + 12] // V
2081 mov edx, [esp + 12 + 16] // argb
2082 mov ebx, [esp + 12 + 20] // yuvconstants
2083 mov ecx, [esp + 12 + 24] // width
2084 sub edi, esi
2085 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2086
2087 convertloop:
2088 READYUV422_AVX2
2089 YUVTORGB_AVX2(ebx)
2090 STOREARGB_AVX2
2091
2092 sub ecx, 16
2093 jg convertloop
2094
2095 pop ebx
2096 pop edi
2097 pop esi
2098 vzeroupper
2099 ret
2100 }
2101 }
2102 #endif // HAS_I422TOARGBROW_AVX2
2103
2104 #ifdef HAS_I422ALPHATOARGBROW_AVX2
2105 // 16 pixels
2106 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2107 __declspec(naked) void I422AlphaToARGBRow_AVX2(
2108 const uint8* y_buf,
2109 const uint8* u_buf,
2110 const uint8* v_buf,
2111 const uint8* a_buf,
2112 uint8* dst_argb,
2113 const struct YuvConstants* yuvconstants,
2114 int width) {
2115 __asm {
2116 push esi
2117 push edi
2118 push ebx
2119 push ebp
2120 mov eax, [esp + 16 + 4] // Y
2121 mov esi, [esp + 16 + 8] // U
2122 mov edi, [esp + 16 + 12] // V
2123 mov ebp, [esp + 16 + 16] // A
2124 mov edx, [esp + 16 + 20] // argb
2125 mov ebx, [esp + 16 + 24] // yuvconstants
2126 mov ecx, [esp + 16 + 28] // width
2127 sub edi, esi
2128
2129 convertloop:
2130 READYUVA422_AVX2
2131 YUVTORGB_AVX2(ebx)
2132 STOREARGB_AVX2
2133
2134 sub ecx, 16
2135 jg convertloop
2136
2137 pop ebp
2138 pop ebx
2139 pop edi
2140 pop esi
2141 vzeroupper
2142 ret
2143 }
2144 }
2145 #endif // HAS_I422ALPHATOARGBROW_AVX2
2146
2147 #ifdef HAS_I444TOARGBROW_AVX2
2148 // 16 pixels
2149 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2150 __declspec(naked) void I444ToARGBRow_AVX2(
2151 const uint8* y_buf,
2152 const uint8* u_buf,
2153 const uint8* v_buf,
2154 uint8* dst_argb,
2155 const struct YuvConstants* yuvconstants,
2156 int width) {
2157 __asm {
2158 push esi
2159 push edi
2160 push ebx
2161 mov eax, [esp + 12 + 4] // Y
2162 mov esi, [esp + 12 + 8] // U
2163 mov edi, [esp + 12 + 12] // V
2164 mov edx, [esp + 12 + 16] // argb
2165 mov ebx, [esp + 12 + 20] // yuvconstants
2166 mov ecx, [esp + 12 + 24] // width
2167 sub edi, esi
2168 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2169 convertloop:
2170 READYUV444_AVX2
2171 YUVTORGB_AVX2(ebx)
2172 STOREARGB_AVX2
2173
2174 sub ecx, 16
2175 jg convertloop
2176
2177 pop ebx
2178 pop edi
2179 pop esi
2180 vzeroupper
2181 ret
2182 }
2183 }
2184 #endif // HAS_I444TOARGBROW_AVX2
2185
2186 #ifdef HAS_NV12TOARGBROW_AVX2
2187 // 16 pixels.
2188 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2189 __declspec(naked) void NV12ToARGBRow_AVX2(
2190 const uint8* y_buf,
2191 const uint8* uv_buf,
2192 uint8* dst_argb,
2193 const struct YuvConstants* yuvconstants,
2194 int width) {
2195 __asm {
2196 push esi
2197 push ebx
2198 mov eax, [esp + 8 + 4] // Y
2199 mov esi, [esp + 8 + 8] // UV
2200 mov edx, [esp + 8 + 12] // argb
2201 mov ebx, [esp + 8 + 16] // yuvconstants
2202 mov ecx, [esp + 8 + 20] // width
2203 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2204
2205 convertloop:
2206 READNV12_AVX2
2207 YUVTORGB_AVX2(ebx)
2208 STOREARGB_AVX2
2209
2210 sub ecx, 16
2211 jg convertloop
2212
2213 pop ebx
2214 pop esi
2215 vzeroupper
2216 ret
2217 }
2218 }
2219 #endif // HAS_NV12TOARGBROW_AVX2
2220
2221 #ifdef HAS_NV21TOARGBROW_AVX2
2222 // 16 pixels.
2223 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2224 __declspec(naked) void NV21ToARGBRow_AVX2(
2225 const uint8* y_buf,
2226 const uint8* vu_buf,
2227 uint8* dst_argb,
2228 const struct YuvConstants* yuvconstants,
2229 int width) {
2230 __asm {
2231 push esi
2232 push ebx
2233 mov eax, [esp + 8 + 4] // Y
2234 mov esi, [esp + 8 + 8] // VU
2235 mov edx, [esp + 8 + 12] // argb
2236 mov ebx, [esp + 8 + 16] // yuvconstants
2237 mov ecx, [esp + 8 + 20] // width
2238 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2239
2240 convertloop:
2241 READNV21_AVX2
2242 YUVTORGB_AVX2(ebx)
2243 STOREARGB_AVX2
2244
2245 sub ecx, 16
2246 jg convertloop
2247
2248 pop ebx
2249 pop esi
2250 vzeroupper
2251 ret
2252 }
2253 }
2254 #endif // HAS_NV21TOARGBROW_AVX2
2255
2256 #ifdef HAS_YUY2TOARGBROW_AVX2
2257 // 16 pixels.
2258 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2259 __declspec(naked) void YUY2ToARGBRow_AVX2(
2260 const uint8* src_yuy2,
2261 uint8* dst_argb,
2262 const struct YuvConstants* yuvconstants,
2263 int width) {
2264 __asm {
2265 push ebx
2266 mov eax, [esp + 4 + 4] // yuy2
2267 mov edx, [esp + 4 + 8] // argb
2268 mov ebx, [esp + 4 + 12] // yuvconstants
2269 mov ecx, [esp + 4 + 16] // width
2270 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2271
2272 convertloop:
2273 READYUY2_AVX2
2274 YUVTORGB_AVX2(ebx)
2275 STOREARGB_AVX2
2276
2277 sub ecx, 16
2278 jg convertloop
2279
2280 pop ebx
2281 vzeroupper
2282 ret
2283 }
2284 }
2285 #endif // HAS_YUY2TOARGBROW_AVX2
2286
2287 #ifdef HAS_UYVYTOARGBROW_AVX2
2288 // 16 pixels.
2289 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2290 __declspec(naked) void UYVYToARGBRow_AVX2(
2291 const uint8* src_uyvy,
2292 uint8* dst_argb,
2293 const struct YuvConstants* yuvconstants,
2294 int width) {
2295 __asm {
2296 push ebx
2297 mov eax, [esp + 4 + 4] // uyvy
2298 mov edx, [esp + 4 + 8] // argb
2299 mov ebx, [esp + 4 + 12] // yuvconstants
2300 mov ecx, [esp + 4 + 16] // width
2301 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2302
2303 convertloop:
2304 READUYVY_AVX2
2305 YUVTORGB_AVX2(ebx)
2306 STOREARGB_AVX2
2307
2308 sub ecx, 16
2309 jg convertloop
2310
2311 pop ebx
2312 vzeroupper
2313 ret
2314 }
2315 }
2316 #endif // HAS_UYVYTOARGBROW_AVX2
2317
2318 #ifdef HAS_I422TORGBAROW_AVX2
2319 // 16 pixels
2320 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2321 __declspec(naked) void I422ToRGBARow_AVX2(
2322 const uint8* y_buf,
2323 const uint8* u_buf,
2324 const uint8* v_buf,
2325 uint8* dst_argb,
2326 const struct YuvConstants* yuvconstants,
2327 int width) {
2328 __asm {
2329 push esi
2330 push edi
2331 push ebx
2332 mov eax, [esp + 12 + 4] // Y
2333 mov esi, [esp + 12 + 8] // U
2334 mov edi, [esp + 12 + 12] // V
2335 mov edx, [esp + 12 + 16] // abgr
2336 mov ebx, [esp + 12 + 20] // yuvconstants
2337 mov ecx, [esp + 12 + 24] // width
2338 sub edi, esi
2339 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
2340
2341 convertloop:
2342 READYUV422_AVX2
2343 YUVTORGB_AVX2(ebx)
2344 STORERGBA_AVX2
2345
2346 sub ecx, 16
2347 jg convertloop
2348
2349 pop ebx
2350 pop edi
2351 pop esi
2352 vzeroupper
2353 ret
2354 }
2355 }
2356 #endif // HAS_I422TORGBAROW_AVX2
2357
2358 #if defined(HAS_I422TOARGBROW_SSSE3)
2359 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
2360 // Allows a conversion with half size scaling.
2361
2362 // Read 8 UV from 444.
2363 #define READYUV444 \
2364 __asm { \
2365 __asm movq xmm0, qword ptr [esi] /* U */ \
2366 __asm movq xmm1, qword ptr [esi + edi] /* V */ \
2367 __asm lea esi, [esi + 8] \
2368 __asm punpcklbw xmm0, xmm1 /* UV */ \
2369 __asm movq xmm4, qword ptr [eax] \
2370 __asm punpcklbw xmm4, xmm4 \
2371 __asm lea eax, [eax + 8]}
2372
2373 // Read 4 UV from 422, upsample to 8 UV.
2374 #define READYUV422 \
2375 __asm { \
2376 __asm movd xmm0, [esi] /* U */ \
2377 __asm movd xmm1, [esi + edi] /* V */ \
2378 __asm lea esi, [esi + 4] \
2379 __asm punpcklbw xmm0, xmm1 /* UV */ \
2380 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2381 __asm movq xmm4, qword ptr [eax] \
2382 __asm punpcklbw xmm4, xmm4 \
2383 __asm lea eax, [eax + 8]}
2384
2385 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
2386 #define READYUVA422 \
2387 __asm { \
2388 __asm movd xmm0, [esi] /* U */ \
2389 __asm movd xmm1, [esi + edi] /* V */ \
2390 __asm lea esi, [esi + 4] \
2391 __asm punpcklbw xmm0, xmm1 /* UV */ \
2392 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2393 __asm movq xmm4, qword ptr [eax] /* Y */ \
2394 __asm punpcklbw xmm4, xmm4 \
2395 __asm lea eax, [eax + 8] \
2396 __asm movq xmm5, qword ptr [ebp] /* A */ \
2397 __asm lea ebp, [ebp + 8]}
2398
2399 // Read 4 UV from NV12, upsample to 8 UV.
2400 #define READNV12 \
2401 __asm { \
2402 __asm movq xmm0, qword ptr [esi] /* UV */ \
2403 __asm lea esi, [esi + 8] \
2404 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
2405 __asm movq xmm4, qword ptr [eax] \
2406 __asm punpcklbw xmm4, xmm4 \
2407 __asm lea eax, [eax + 8]}
2408
2409 // Read 4 VU from NV21, upsample to 8 UV.
2410 #define READNV21 \
2411 __asm { \
2412 __asm movq xmm0, qword ptr [esi] /* UV */ \
2413 __asm lea esi, [esi + 8] \
2414 __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
2415 __asm movq xmm4, qword ptr [eax] \
2416 __asm punpcklbw xmm4, xmm4 \
2417 __asm lea eax, [eax + 8]}
2418
2419 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
2420 #define READYUY2 \
2421 __asm { \
2422 __asm movdqu xmm4, [eax] /* YUY2 */ \
2423 __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
2424 __asm movdqu xmm0, [eax] /* UV */ \
2425 __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
2426 __asm lea eax, [eax + 16]}
2427
2428 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
2429 #define READUYVY \
2430 __asm { \
2431 __asm movdqu xmm4, [eax] /* UYVY */ \
2432 __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
2433 __asm movdqu xmm0, [eax] /* UV */ \
2434 __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
2435 __asm lea eax, [eax + 16]}
2436
2437 // Convert 8 pixels: 8 UV and 8 Y.
2438 #define YUVTORGB(YuvConstants) \
2439 __asm { \
2440 __asm movdqa xmm1, xmm0 \
2441 __asm movdqa xmm2, xmm0 \
2442 __asm movdqa xmm3, xmm0 \
2443 __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
2444 __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
2445 __asm psubw xmm0, xmm1 \
2446 __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
2447 __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
2448 __asm psubw xmm1, xmm2 \
2449 __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
2450 __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
2451 __asm psubw xmm2, xmm3 \
2452 __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
2453 __asm paddsw xmm0, xmm4 /* B += Y */ \
2454 __asm paddsw xmm1, xmm4 /* G += Y */ \
2455 __asm paddsw xmm2, xmm4 /* R += Y */ \
2456 __asm psraw xmm0, 6 \
2457 __asm psraw xmm1, 6 \
2458 __asm psraw xmm2, 6 \
2459 __asm packuswb xmm0, xmm0 /* B */ \
2460 __asm packuswb xmm1, xmm1 /* G */ \
2461 __asm packuswb xmm2, xmm2 /* R */ \
2462 }
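// Same per pixel math as YUVTORGB_AVX2 above, producing 8 pixels with SSE registers.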
2463
2464 // Store 8 ARGB values.
2465 #define STOREARGB \
2466 __asm { \
2467 __asm punpcklbw xmm0, xmm1 /* BG */ \
2468 __asm punpcklbw xmm2, xmm5 /* RA */ \
2469 __asm movdqa xmm1, xmm0 \
2470 __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
2471 __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
2472 __asm movdqu 0[edx], xmm0 \
2473 __asm movdqu 16[edx], xmm1 \
2474 __asm lea edx, [edx + 32]}
2475
2476 // Store 8 BGRA values.
2477 #define STOREBGRA \
2478 __asm { \
2479 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
2480 __asm punpcklbw xmm1, xmm0 /* GB */ \
2481 __asm punpcklbw xmm5, xmm2 /* AR */ \
2482 __asm movdqa xmm0, xmm5 \
2483 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
2484 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
2485 __asm movdqu 0[edx], xmm5 \
2486 __asm movdqu 16[edx], xmm0 \
2487 __asm lea edx, [edx + 32]}
2488
2489 // Store 8 RGBA values.
2490 #define STORERGBA \
2491 __asm { \
2492 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
2493 __asm punpcklbw xmm1, xmm2 /* GR */ \
2494 __asm punpcklbw xmm5, xmm0 /* AB */ \
2495 __asm movdqa xmm0, xmm5 \
2496 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
2497 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
2498 __asm movdqu 0[edx], xmm5 \
2499 __asm movdqu 16[edx], xmm0 \
2500 __asm lea edx, [edx + 32]}
2501
2502 // Store 8 RGB24 values.
2503 #define STORERGB24 \
2504 __asm {/* Weave into RRGB */ \
2505 __asm punpcklbw xmm0, xmm1 /* BG */ \
2506 __asm punpcklbw xmm2, xmm2 /* RR */ \
2507 __asm movdqa xmm1, xmm0 \
2508 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2509 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
2510 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
2511 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
2512 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
2513 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
2514 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
2515 __asm lea edx, [edx + 24]}
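// The weave above leaves a duplicate R byte where alpha would go; the two
// shuffle masks then drop every 4th byte so each pixel packs down to 3 bytes.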
2516
2517 // Store 8 RGB565 values.
2518 #define STORERGB565 \
2519 __asm {/* Weave into RRGB */ \
2520 __asm punpcklbw xmm0, xmm1 /* BG */ \
2521 __asm punpcklbw xmm2, xmm2 /* RR */ \
2522 __asm movdqa xmm1, xmm0 \
2523 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
2524 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
2525 __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
2526 __asm movdqa xmm2, xmm0 /* G */ \
2527 __asm pslld xmm0, 8 /* R */ \
2528 __asm psrld xmm3, 3 /* B */ \
2529 __asm psrld xmm2, 5 /* G */ \
2530 __asm psrad xmm0, 16 /* R */ \
2531 __asm pand xmm3, xmm5 /* B */ \
2532 __asm pand xmm2, xmm6 /* G */ \
2533 __asm pand xmm0, xmm7 /* R */ \
2534 __asm por xmm3, xmm2 /* BG */ \
2535 __asm por xmm0, xmm3 /* BGR */ \
2536 __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
2537 __asm movdqa xmm2, xmm1 /* G */ \
2538 __asm pslld xmm1, 8 /* R */ \
2539 __asm psrld xmm3, 3 /* B */ \
2540 __asm psrld xmm2, 5 /* G */ \
2541 __asm psrad xmm1, 16 /* R */ \
2542 __asm pand xmm3, xmm5 /* B */ \
2543 __asm pand xmm2, xmm6 /* G */ \
2544 __asm pand xmm1, xmm7 /* R */ \
2545 __asm por xmm3, xmm2 /* BG */ \
2546 __asm por xmm1, xmm3 /* BGR */ \
2547 __asm packssdw xmm0, xmm1 \
2548 __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
2549 __asm lea edx, [edx + 16]}
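// Note: psrad (rather than psrld) on the R path sign extends bit 15 of the 565
// value through the upper half of each dword, so the final packssdw narrows
// 32 bits to 16 bits without saturating.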
2550
2551 // 8 pixels.
2552 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2553 __declspec(naked) void I444ToARGBRow_SSSE3(
2554 const uint8* y_buf,
2555 const uint8* u_buf,
2556 const uint8* v_buf,
2557 uint8* dst_argb,
2558 const struct YuvConstants* yuvconstants,
2559 int width) {
2560 __asm {
2561 push esi
2562 push edi
2563 push ebx
2564 mov eax, [esp + 12 + 4] // Y
2565 mov esi, [esp + 12 + 8] // U
2566 mov edi, [esp + 12 + 12] // V
2567 mov edx, [esp + 12 + 16] // argb
2568 mov ebx, [esp + 12 + 20] // yuvconstants
2569 mov ecx, [esp + 12 + 24] // width
2570 sub edi, esi
2571 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2572
2573 convertloop:
2574 READYUV444
2575 YUVTORGB(ebx)
2576 STOREARGB
2577
2578 sub ecx, 8
2579 jg convertloop
2580
2581 pop ebx
2582 pop edi
2583 pop esi
2584 ret
2585 }
2586 }
2587
2588 // 8 pixels.
2589 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2590 __declspec(naked) void I422ToRGB24Row_SSSE3(
2591 const uint8* y_buf,
2592 const uint8* u_buf,
2593 const uint8* v_buf,
2594 uint8* dst_rgb24,
2595 const struct YuvConstants* yuvconstants,
2596 int width) {
2597 __asm {
2598 push esi
2599 push edi
2600 push ebx
2601 mov eax, [esp + 12 + 4] // Y
2602 mov esi, [esp + 12 + 8] // U
2603 mov edi, [esp + 12 + 12] // V
2604 mov edx, [esp + 12 + 16] // rgb24
2605 mov ebx, [esp + 12 + 20] // yuvconstants
2606 mov ecx, [esp + 12 + 24] // width
2607 sub edi, esi
2608 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
2609 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
2610
2611 convertloop:
2612 READYUV422
2613 YUVTORGB(ebx)
2614 STORERGB24
2615
2616 sub ecx, 8
2617 jg convertloop
2618
2619 pop ebx
2620 pop edi
2621 pop esi
2622 ret
2623 }
2624 }
2625
2626 // 8 pixels
2627 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2628 __declspec(naked) void I422ToRGB565Row_SSSE3(
2629 const uint8* y_buf,
2630 const uint8* u_buf,
2631 const uint8* v_buf,
2632 uint8* rgb565_buf,
2633 const struct YuvConstants* yuvconstants,
2634 int width) {
2635 __asm {
2636 push esi
2637 push edi
2638 push ebx
2639 mov eax, [esp + 12 + 4] // Y
2640 mov esi, [esp + 12 + 8] // U
2641 mov edi, [esp + 12 + 12] // V
2642 mov edx, [esp + 12 + 16] // rgb565
2643 mov ebx, [esp + 12 + 20] // yuvconstants
2644 mov ecx, [esp + 12 + 24] // width
2645 sub edi, esi
2646 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
2647 psrld xmm5, 27
2648 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
2649 psrld xmm6, 26
2650 pslld xmm6, 5
2651 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
2652 pslld xmm7, 11
2653
2654 convertloop:
2655 READYUV422
2656 YUVTORGB(ebx)
2657 STORERGB565
2658
2659 sub ecx, 8
2660 jg convertloop
2661
2662 pop ebx
2663 pop edi
2664 pop esi
2665 ret
2666 }
2667 }
2668
2669 // 8 pixels.
2670 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2671 __declspec(naked) void I422ToARGBRow_SSSE3(
2672 const uint8* y_buf,
2673 const uint8* u_buf,
2674 const uint8* v_buf,
2675 uint8* dst_argb,
2676 const struct YuvConstants* yuvconstants,
2677 int width) {
2678 __asm {
2679 push esi
2680 push edi
2681 push ebx
2682 mov eax, [esp + 12 + 4] // Y
2683 mov esi, [esp + 12 + 8] // U
2684 mov edi, [esp + 12 + 12] // V
2685 mov edx, [esp + 12 + 16] // argb
2686 mov ebx, [esp + 12 + 20] // yuvconstants
2687 mov ecx, [esp + 12 + 24] // width
2688 sub edi, esi
2689 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2690
2691 convertloop:
2692 READYUV422
2693 YUVTORGB(ebx)
2694 STOREARGB
2695
2696 sub ecx, 8
2697 jg convertloop
2698
2699 pop ebx
2700 pop edi
2701 pop esi
2702 ret
2703 }
2704 }
2705
2706 // 8 pixels.
2707 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
2708 __declspec(naked) void I422AlphaToARGBRow_SSSE3(
2709 const uint8* y_buf,
2710 const uint8* u_buf,
2711 const uint8* v_buf,
2712 const uint8* a_buf,
2713 uint8* dst_argb,
2714 const struct YuvConstants* yuvconstants,
2715 int width) {
2716 __asm {
2717 push esi
2718 push edi
2719 push ebx
2720 push ebp
2721 mov eax, [esp + 16 + 4] // Y
2722 mov esi, [esp + 16 + 8] // U
2723 mov edi, [esp + 16 + 12] // V
2724 mov ebp, [esp + 16 + 16] // A
2725 mov edx, [esp + 16 + 20] // argb
2726 mov ebx, [esp + 16 + 24] // yuvconstants
2727 mov ecx, [esp + 16 + 28] // width
2728 sub edi, esi
2729
2730 convertloop:
2731 READYUVA422
2732 YUVTORGB(ebx)
2733 STOREARGB
2734
2735 sub ecx, 8
2736 jg convertloop
2737
2738 pop ebp
2739 pop ebx
2740 pop edi
2741 pop esi
2742 ret
2743 }
2744 }
2745
2746 // 8 pixels.
2747 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2748 __declspec(naked) void NV12ToARGBRow_SSSE3(
2749 const uint8* y_buf,
2750 const uint8* uv_buf,
2751 uint8* dst_argb,
2752 const struct YuvConstants* yuvconstants,
2753 int width) {
2754 __asm {
2755 push esi
2756 push ebx
2757 mov eax, [esp + 8 + 4] // Y
2758 mov esi, [esp + 8 + 8] // UV
2759 mov edx, [esp + 8 + 12] // argb
2760 mov ebx, [esp + 8 + 16] // yuvconstants
2761 mov ecx, [esp + 8 + 20] // width
2762 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2763
2764 convertloop:
2765 READNV12
2766 YUVTORGB(ebx)
2767 STOREARGB
2768
2769 sub ecx, 8
2770 jg convertloop
2771
2772 pop ebx
2773 pop esi
2774 ret
2775 }
2776 }
2777
2778 // 8 pixels.
2779 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2780 __declspec(naked) void NV21ToARGBRow_SSSE3(
2781 const uint8* y_buf,
2782 const uint8* vu_buf,
2783 uint8* dst_argb,
2784 const struct YuvConstants* yuvconstants,
2785 int width) {
2786 __asm {
2787 push esi
2788 push ebx
2789 mov eax, [esp + 8 + 4] // Y
2790 mov esi, [esp + 8 + 8] // VU
2791 mov edx, [esp + 8 + 12] // argb
2792 mov ebx, [esp + 8 + 16] // yuvconstants
2793 mov ecx, [esp + 8 + 20] // width
2794 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2795
2796 convertloop:
2797 READNV21
2798 YUVTORGB(ebx)
2799 STOREARGB
2800
2801 sub ecx, 8
2802 jg convertloop
2803
2804 pop ebx
2805 pop esi
2806 ret
2807 }
2808 }
2809
2810 // 8 pixels.
2811 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2812 __declspec(naked) void YUY2ToARGBRow_SSSE3(
2813 const uint8* src_yuy2,
2814 uint8* dst_argb,
2815 const struct YuvConstants* yuvconstants,
2816 int width) {
2817 __asm {
2818 push ebx
2819 mov eax, [esp + 4 + 4] // yuy2
2820 mov edx, [esp + 4 + 8] // argb
2821 mov ebx, [esp + 4 + 12] // yuvconstants
2822 mov ecx, [esp + 4 + 16] // width
2823 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2824
2825 convertloop:
2826 READYUY2
2827 YUVTORGB(ebx)
2828 STOREARGB
2829
2830 sub ecx, 8
2831 jg convertloop
2832
2833 pop ebx
2834 ret
2835 }
2836 }
2837
2838 // 8 pixels.
2839 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
2840 __declspec(naked) void UYVYToARGBRow_SSSE3(
2841 const uint8* src_uyvy,
2842 uint8* dst_argb,
2843 const struct YuvConstants* yuvconstants,
2844 int width) {
2845 __asm {
2846 push ebx
2847 mov eax, [esp + 4 + 4] // uyvy
2848 mov edx, [esp + 4 + 8] // argb
2849 mov ebx, [esp + 4 + 12] // yuvconstants
2850 mov ecx, [esp + 4 + 16] // width
2851 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2852
2853 convertloop:
2854 READUYVY
2855 YUVTORGB(ebx)
2856 STOREARGB
2857
2858 sub ecx, 8
2859 jg convertloop
2860
2861 pop ebx
2862 ret
2863 }
2864 }
2865
2866 __declspec(naked) void I422ToRGBARow_SSSE3(
2867 const uint8* y_buf,
2868 const uint8* u_buf,
2869 const uint8* v_buf,
2870 uint8* dst_rgba,
2871 const struct YuvConstants* yuvconstants,
2872 int width) {
2873 __asm {
2874 push esi
2875 push edi
2876 push ebx
2877 mov eax, [esp + 12 + 4] // Y
2878 mov esi, [esp + 12 + 8] // U
2879 mov edi, [esp + 12 + 12] // V
2880 mov edx, [esp + 12 + 16] // rgba
2881 mov ebx, [esp + 12 + 20] // yuvconstants
2882 mov ecx, [esp + 12 + 24] // width
2883 sub edi, esi
2884
2885 convertloop:
2886 READYUV422
2887 YUVTORGB(ebx)
2888 STORERGBA
2889
2890 sub ecx, 8
2891 jg convertloop
2892
2893 pop ebx
2894 pop edi
2895 pop esi
2896 ret
2897 }
2898 }
2899 #endif // HAS_I422TOARGBROW_SSSE3
2900
2901 #ifdef HAS_I400TOARGBROW_SSE2
2902 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
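// In scalar terms, roughly: g = clamp((((y * 0x0101) * 0x4a35 >> 16) - 0x0488) >> 6, 0, 255)
// and each output pixel is 0xff000000 | g * 0x010101.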
2903 __declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
2904 uint8* rgb_buf,
2905 int width) {
2906 __asm {
2907 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
2908 movd xmm2, eax
2909 pshufd xmm2, xmm2,0
2910 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
2911 movd xmm3, eax
2912 pshufd xmm3, xmm3, 0
2913 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
2914 pslld xmm4, 24
2915
2916 mov eax, [esp + 4] // Y
2917 mov edx, [esp + 8] // rgb
2918 mov ecx, [esp + 12] // width
2919
2920 convertloop:
2921 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2922 movq xmm0, qword ptr [eax]
2923 lea eax, [eax + 8]
2924 punpcklbw xmm0, xmm0 // Y.Y
2925 pmulhuw xmm0, xmm2
2926 psubusw xmm0, xmm3
2927 psrlw xmm0, 6
2928 packuswb xmm0, xmm0 // G
2929
2930 // Step 2: Weave into ARGB
2931 punpcklbw xmm0, xmm0 // GG
2932 movdqa xmm1, xmm0
2933 punpcklwd xmm0, xmm0 // BGRA first 4 pixels
2934 punpckhwd xmm1, xmm1 // BGRA next 4 pixels
2935 por xmm0, xmm4
2936 por xmm1, xmm4
2937 movdqu [edx], xmm0
2938 movdqu [edx + 16], xmm1
2939 lea edx, [edx + 32]
2940 sub ecx, 8
2941 jg convertloop
2942 ret
2943 }
2944 }
2945 #endif // HAS_I400TOARGBROW_SSE2
2946
2947 #ifdef HAS_I400TOARGBROW_AVX2
2948 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2949 // note: vpunpcklbw mutates and vpackuswb unmutates.
2950 __declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
2951 uint8* rgb_buf,
2952 int width) {
2953 __asm {
2954 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256 * 256 / 257)
2955 vmovd xmm2, eax
2956 vbroadcastss ymm2, xmm2
2957 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
2958 vmovd xmm3, eax
2959 vbroadcastss ymm3, xmm3
2960 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
2961 vpslld ymm4, ymm4, 24
2962
2963 mov eax, [esp + 4] // Y
2964 mov edx, [esp + 8] // rgb
2965 mov ecx, [esp + 12] // width
2966
2967 convertloop:
2968 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
2969 vmovdqu xmm0, [eax]
2970 lea eax, [eax + 16]
2971 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
2972 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
2973 vpmulhuw ymm0, ymm0, ymm2
2974 vpsubusw ymm0, ymm0, ymm3
2975 vpsrlw ymm0, ymm0, 6
2976 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
2977
2978 // TODO(fbarchard): Weave alpha with unpack.
2979 // Step 2: Weave into ARGB
2980 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
2981 vpermq ymm1, ymm1, 0xd8
2982 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
2983 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
2984 vpor ymm0, ymm0, ymm4
2985 vpor ymm1, ymm1, ymm4
2986 vmovdqu [edx], ymm0
2987 vmovdqu [edx + 32], ymm1
2988 lea edx, [edx + 64]
2989 sub ecx, 16
2990 jg convertloop
2991 vzeroupper
2992 ret
2993 }
2994 }
2995 #endif // HAS_I400TOARGBROW_AVX2
2996
2997 #ifdef HAS_MIRRORROW_SSSE3
2998 // Shuffle table for reversing the bytes.
2999 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
3000 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
3001
3002 // TODO(fbarchard): Replace lea with -16 offset.
3003 __declspec(naked) void MirrorRow_SSSE3(const uint8* src,
3004 uint8* dst,
3005 int width) {
3006 __asm {
3007 mov eax, [esp + 4] // src
3008 mov edx, [esp + 8] // dst
3009 mov ecx, [esp + 12] // width
3010 movdqa xmm5, xmmword ptr kShuffleMirror
3011
3012 convertloop:
3013 movdqu xmm0, [eax - 16 + ecx]
3014 pshufb xmm0, xmm5
3015 movdqu [edx], xmm0
3016 lea edx, [edx + 16]
3017 sub ecx, 16
3018 jg convertloop
3019 ret
3020 }
3021 }
3022 #endif // HAS_MIRRORROW_SSSE3
3023
3024 #ifdef HAS_MIRRORROW_AVX2
3025 __declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3026 __asm {
3027 mov eax, [esp + 4] // src
3028 mov edx, [esp + 8] // dst
3029 mov ecx, [esp + 12] // width
3030 vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
3031
3032 convertloop:
3033 vmovdqu ymm0, [eax - 32 + ecx]
3034 vpshufb ymm0, ymm0, ymm5
3035 vpermq ymm0, ymm0, 0x4e // swap high and low halves
3036 vmovdqu [edx], ymm0
3037 lea edx, [edx + 32]
3038 sub ecx, 32
3039 jg convertloop
3040 vzeroupper
3041 ret
3042 }
3043 }
3044 #endif // HAS_MIRRORROW_AVX2
3045
3046 #ifdef HAS_MIRRORUVROW_SSSE3
3047 // Shuffle table for reversing the bytes of UV channels.
3048 static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
3049 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
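// The even source bytes (U) are gathered in reverse order into the low 8 bytes
// and the odd bytes (V) into the high 8 bytes, so a single pshufb both mirrors
// and de-interleaves the row.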
3050
3051 __declspec(naked) void MirrorUVRow_SSSE3(const uint8* src,
3052 uint8* dst_u,
3053 uint8* dst_v,
3054 int width) {
3055 __asm {
3056 push edi
3057 mov eax, [esp + 4 + 4] // src
3058 mov edx, [esp + 4 + 8] // dst_u
3059 mov edi, [esp + 4 + 12] // dst_v
3060 mov ecx, [esp + 4 + 16] // width
3061 movdqa xmm1, xmmword ptr kShuffleMirrorUV
3062 lea eax, [eax + ecx * 2 - 16]
3063 sub edi, edx
3064
3065 convertloop:
3066 movdqu xmm0, [eax]
3067 lea eax, [eax - 16]
3068 pshufb xmm0, xmm1
3069 movlpd qword ptr [edx], xmm0
3070 movhpd qword ptr [edx + edi], xmm0
3071 lea edx, [edx + 8]
3072 sub ecx, 8
3073 jg convertloop
3074
3075 pop edi
3076 ret
3077 }
3078 }
3079 #endif // HAS_MIRRORUVROW_SSSE3
3080
3081 #ifdef HAS_ARGBMIRRORROW_SSE2
3082 __declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src,
3083 uint8* dst,
3084 int width) {
3085 __asm {
3086 mov eax, [esp + 4] // src
3087 mov edx, [esp + 8] // dst
3088 mov ecx, [esp + 12] // width
3089 lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
3090
3091 convertloop:
3092 movdqu xmm0, [eax]
3093 lea eax, [eax - 16]
3094 pshufd xmm0, xmm0, 0x1b
3095 movdqu [edx], xmm0
3096 lea edx, [edx + 16]
3097 sub ecx, 4
3098 jg convertloop
3099 ret
3100 }
3101 }
3102 #endif // HAS_ARGBMIRRORROW_SSE2
3103
3104 #ifdef HAS_ARGBMIRRORROW_AVX2
3105 // Shuffle table for reversing the bytes.
3106 // Shuffle table for reversing the pixels (32 bit ARGB values).
3107
3108 __declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src,
3109 uint8* dst,
3110 int width) {
3111 __asm {
3112 mov eax, [esp + 4] // src
3113 mov edx, [esp + 8] // dst
3114 mov ecx, [esp + 12] // width
3115 vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2
3116
3117 convertloop:
3118 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
3119 vmovdqu [edx], ymm0
3120 lea edx, [edx + 32]
3121 sub ecx, 8
3122 jg convertloop
3123 vzeroupper
3124 ret
3125 }
3126 }
3127 #endif // HAS_ARGBMIRRORROW_AVX2
3128
3129 #ifdef HAS_SPLITUVROW_SSE2
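// Scalar equivalent: dst_u[i] = src_uv[2 * i]; dst_v[i] = src_uv[2 * i + 1];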
3130 __declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv,
3131 uint8* dst_u,
3132 uint8* dst_v,
3133 int width) {
3134 __asm {
3135 push edi
3136 mov eax, [esp + 4 + 4] // src_uv
3137 mov edx, [esp + 4 + 8] // dst_u
3138 mov edi, [esp + 4 + 12] // dst_v
3139 mov ecx, [esp + 4 + 16] // width
3140 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3141 psrlw xmm5, 8
3142 sub edi, edx
3143
3144 convertloop:
3145 movdqu xmm0, [eax]
3146 movdqu xmm1, [eax + 16]
3147 lea eax, [eax + 32]
3148 movdqa xmm2, xmm0
3149 movdqa xmm3, xmm1
3150 pand xmm0, xmm5 // even bytes
3151 pand xmm1, xmm5
3152 packuswb xmm0, xmm1
3153 psrlw xmm2, 8 // odd bytes
3154 psrlw xmm3, 8
3155 packuswb xmm2, xmm3
3156 movdqu [edx], xmm0
3157 movdqu [edx + edi], xmm2
3158 lea edx, [edx + 16]
3159 sub ecx, 16
3160 jg convertloop
3161
3162 pop edi
3163 ret
3164 }
3165 }
3166
3167 #endif // HAS_SPLITUVROW_SSE2
3168
3169 #ifdef HAS_SPLITUVROW_AVX2
3170 __declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv,
3171 uint8* dst_u,
3172 uint8* dst_v,
3173 int width) {
3174 __asm {
3175 push edi
3176 mov eax, [esp + 4 + 4] // src_uv
3177 mov edx, [esp + 4 + 8] // dst_u
3178 mov edi, [esp + 4 + 12] // dst_v
3179 mov ecx, [esp + 4 + 16] // width
3180 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3181 vpsrlw ymm5, ymm5, 8
3182 sub edi, edx
3183
3184 convertloop:
3185 vmovdqu ymm0, [eax]
3186 vmovdqu ymm1, [eax + 32]
3187 lea eax, [eax + 64]
3188 vpsrlw ymm2, ymm0, 8 // odd bytes
3189 vpsrlw ymm3, ymm1, 8
3190 vpand ymm0, ymm0, ymm5 // even bytes
3191 vpand ymm1, ymm1, ymm5
3192 vpackuswb ymm0, ymm0, ymm1
3193 vpackuswb ymm2, ymm2, ymm3
3194 vpermq ymm0, ymm0, 0xd8
3195 vpermq ymm2, ymm2, 0xd8
3196 vmovdqu [edx], ymm0
3197 vmovdqu [edx + edi], ymm2
3198 lea edx, [edx + 32]
3199 sub ecx, 32
3200 jg convertloop
3201
3202 pop edi
3203 vzeroupper
3204 ret
3205 }
3206 }
3207 #endif // HAS_SPLITUVROW_AVX2
3208
3209 #ifdef HAS_MERGEUVROW_SSE2
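// Scalar equivalent: dst_uv[2 * i] = src_u[i]; dst_uv[2 * i + 1] = src_v[i];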
3210 __declspec(naked) void MergeUVRow_SSE2(const uint8* src_u,
3211 const uint8* src_v,
3212 uint8* dst_uv,
3213 int width) {
3214 __asm {
3215 push edi
3216 mov eax, [esp + 4 + 4] // src_u
3217 mov edx, [esp + 4 + 8] // src_v
3218 mov edi, [esp + 4 + 12] // dst_uv
3219 mov ecx, [esp + 4 + 16] // width
3220 sub edx, eax
3221
3222 convertloop:
3223 movdqu xmm0, [eax] // read 16 U's
3224 movdqu xmm1, [eax + edx] // and 16 V's
3225 lea eax, [eax + 16]
3226 movdqa xmm2, xmm0
3227 punpcklbw xmm0, xmm1 // first 8 UV pairs
3228 punpckhbw xmm2, xmm1 // next 8 UV pairs
3229 movdqu [edi], xmm0
3230 movdqu [edi + 16], xmm2
3231 lea edi, [edi + 32]
3232 sub ecx, 16
3233 jg convertloop
3234
3235 pop edi
3236 ret
3237 }
3238 }
3239 #endif // HAS_MERGEUVROW_SSE2
3240
3241 #ifdef HAS_MERGEUVROW_AVX2
3242 __declspec(naked) void MergeUVRow_AVX2(const uint8* src_u,
3243 const uint8* src_v,
3244 uint8* dst_uv,
3245 int width) {
3246 __asm {
3247 push edi
3248 mov eax, [esp + 4 + 4] // src_u
3249 mov edx, [esp + 4 + 8] // src_v
3250 mov edi, [esp + 4 + 12] // dst_uv
3251 mov ecx, [esp + 4 + 16] // width
3252 sub edx, eax
3253
3254 convertloop:
3255 vmovdqu ymm0, [eax] // read 32 U's
3256 vmovdqu ymm1, [eax + edx] // and 32 V's
3257 lea eax, [eax + 32]
3258 vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
3259 vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
3260 vextractf128 [edi], ymm2, 0 // bytes 0..15
3261 vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
3262 vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
3263 vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
3264 lea edi, [edi + 64]
3265 sub ecx, 32
3266 jg convertloop
3267
3268 pop edi
3269 vzeroupper
3270 ret
3271 }
3272 }
3273 #endif // HAS_MERGEUVROW_AVX2
3274
3275 #ifdef HAS_COPYROW_SSE2
3276 // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
3277 __declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
3278 __asm {
3279 mov eax, [esp + 4] // src
3280 mov edx, [esp + 8] // dst
3281 mov ecx, [esp + 12] // count
3282 test eax, 15
3283 jne convertloopu
3284 test edx, 15
3285 jne convertloopu
3286
3287 convertloopa:
3288 movdqa xmm0, [eax]
3289 movdqa xmm1, [eax + 16]
3290 lea eax, [eax + 32]
3291 movdqa [edx], xmm0
3292 movdqa [edx + 16], xmm1
3293 lea edx, [edx + 32]
3294 sub ecx, 32
3295 jg convertloopa
3296 ret
3297
3298 convertloopu:
3299 movdqu xmm0, [eax]
3300 movdqu xmm1, [eax + 16]
3301 lea eax, [eax + 32]
3302 movdqu [edx], xmm0
3303 movdqu [edx + 16], xmm1
3304 lea edx, [edx + 32]
3305 sub ecx, 32
3306 jg convertloopu
3307 ret
3308 }
3309 }
3310 #endif // HAS_COPYROW_SSE2
3311
3312 #ifdef HAS_COPYROW_AVX
3313 // CopyRow copies 'count' bytes using 32 byte loads/stores, 64 bytes at a time.
3314 __declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
3315 __asm {
3316 mov eax, [esp + 4] // src
3317 mov edx, [esp + 8] // dst
3318 mov ecx, [esp + 12] // count
3319
3320 convertloop:
3321 vmovdqu ymm0, [eax]
3322 vmovdqu ymm1, [eax + 32]
3323 lea eax, [eax + 64]
3324 vmovdqu [edx], ymm0
3325 vmovdqu [edx + 32], ymm1
3326 lea edx, [edx + 64]
3327 sub ecx, 64
3328 jg convertloop
3329
3330 vzeroupper
3331 ret
3332 }
3333 }
3334 #endif // HAS_COPYROW_AVX
3335
3336 // Multiple of 1: handles any byte count using rep movsb.
3337 __declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
3338 __asm {
3339 mov eax, esi
3340 mov edx, edi
3341 mov esi, [esp + 4] // src
3342 mov edi, [esp + 8] // dst
3343 mov ecx, [esp + 12] // count
3344 rep movsb
3345 mov edi, edx
3346 mov esi, eax
3347 ret
3348 }
3349 }
3350
3351 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3352 // width in pixels
3353 __declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src,
3354 uint8* dst,
3355 int width) {
3356 __asm {
3357 mov eax, [esp + 4] // src
3358 mov edx, [esp + 8] // dst
3359 mov ecx, [esp + 12] // count
3360 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3361 pslld xmm0, 24
3362 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3363 psrld xmm1, 8
3364
3365 convertloop:
3366 movdqu xmm2, [eax]
3367 movdqu xmm3, [eax + 16]
3368 lea eax, [eax + 32]
3369 movdqu xmm4, [edx]
3370 movdqu xmm5, [edx + 16]
3371 pand xmm2, xmm0
3372 pand xmm3, xmm0
3373 pand xmm4, xmm1
3374 pand xmm5, xmm1
3375 por xmm2, xmm4
3376 por xmm3, xmm5
3377 movdqu [edx], xmm2
3378 movdqu [edx + 16], xmm3
3379 lea edx, [edx + 32]
3380 sub ecx, 8
3381 jg convertloop
3382
3383 ret
3384 }
3385 }
3386 #endif // HAS_ARGBCOPYALPHAROW_SSE2
3387
3388 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3389 // width in pixels
3390 __declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src,
3391 uint8* dst,
3392 int width) {
3393 __asm {
3394 mov eax, [esp + 4] // src
3395 mov edx, [esp + 8] // dst
3396 mov ecx, [esp + 12] // count
3397 vpcmpeqb ymm0, ymm0, ymm0
3398 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3399
3400 convertloop:
3401 vmovdqu ymm1, [eax]
3402 vmovdqu ymm2, [eax + 32]
3403 lea eax, [eax + 64]
3404 vpblendvb ymm1, ymm1, [edx], ymm0
3405 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3406 vmovdqu [edx], ymm1
3407 vmovdqu [edx + 32], ymm2
3408 lea edx, [edx + 64]
3409 sub ecx, 16
3410 jg convertloop
3411
3412 vzeroupper
3413 ret
3414 }
3415 }
3416 #endif // HAS_ARGBCOPYALPHAROW_AVX2
3417
3418 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
3419 // width in pixels
3420 __declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb,
3421 uint8* dst_a,
3422 int width) {
3423 __asm {
3424 mov eax, [esp + 4] // src_argb
3425 mov edx, [esp + 8] // dst_a
3426 mov ecx, [esp + 12] // width
3427
3428 extractloop:
3429 movdqu xmm0, [eax]
3430 movdqu xmm1, [eax + 16]
3431 lea eax, [eax + 32]
3432 psrld xmm0, 24
3433 psrld xmm1, 24
3434 packssdw xmm0, xmm1
3435 packuswb xmm0, xmm0
3436 movq qword ptr [edx], xmm0
3437 lea edx, [edx + 8]
3438 sub ecx, 8
3439 jg extractloop
3440
3441 ret
3442 }
3443 }
3444 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
3445
3446 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
3447 // width in pixels
3448 __declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb,
3449 uint8* dst_a,
3450 int width) {
3451 __asm {
3452 mov eax, [esp + 4] // src_argb
3453 mov edx, [esp + 8] // dst_a
3454 mov ecx, [esp + 12] // width
3455 vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX
3456
3457 extractloop:
3458 vmovdqu ymm0, [eax]
3459 vmovdqu ymm1, [eax + 32]
3460 vpsrld ymm0, ymm0, 24
3461 vpsrld ymm1, ymm1, 24
3462 vmovdqu ymm2, [eax + 64]
3463 vmovdqu ymm3, [eax + 96]
3464 lea eax, [eax + 128]
3465 vpackssdw ymm0, ymm0, ymm1 // mutates
3466 vpsrld ymm2, ymm2, 24
3467 vpsrld ymm3, ymm3, 24
3468 vpackssdw ymm2, ymm2, ymm3 // mutates
3469 vpackuswb ymm0, ymm0, ymm2 // mutates
3470 vpermd ymm0, ymm4, ymm0 // unmutate
3471 vmovdqu [edx], ymm0
3472 lea edx, [edx + 32]
3473 sub ecx, 32
3474 jg extractloop
3475
3476 vzeroupper
3477 ret
3478 }
3479 }
3480 #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
3481
3482 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3483 // width in pixels
3484 __declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src,
3485 uint8* dst,
3486 int width) {
3487 __asm {
3488 mov eax, [esp + 4] // src
3489 mov edx, [esp + 8] // dst
3490 mov ecx, [esp + 12] // count
3491 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3492 pslld xmm0, 24
3493 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3494 psrld xmm1, 8
3495
3496 convertloop:
3497 movq xmm2, qword ptr [eax] // 8 Y's
3498 lea eax, [eax + 8]
3499 punpcklbw xmm2, xmm2
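// punpckhwd mixes stale xmm3 data into the low word of each dword, but the
// 0xff000000 mask below keeps only the high (Y) byte, so the stale data is
// discarded.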
3500 punpckhwd xmm3, xmm2
3501 punpcklwd xmm2, xmm2
3502 movdqu xmm4, [edx]
3503 movdqu xmm5, [edx + 16]
3504 pand xmm2, xmm0
3505 pand xmm3, xmm0
3506 pand xmm4, xmm1
3507 pand xmm5, xmm1
3508 por xmm2, xmm4
3509 por xmm3, xmm5
3510 movdqu [edx], xmm2
3511 movdqu [edx + 16], xmm3
3512 lea edx, [edx + 32]
3513 sub ecx, 8
3514 jg convertloop
3515
3516 ret
3517 }
3518 }
3519 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3520
3521 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3522 // width in pixels
3523 __declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src,
3524 uint8* dst,
3525 int width) {
3526 __asm {
3527 mov eax, [esp + 4] // src
3528 mov edx, [esp + 8] // dst
3529 mov ecx, [esp + 12] // count
3530 vpcmpeqb ymm0, ymm0, ymm0
3531 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3532
3533 convertloop:
3534 vpmovzxbd ymm1, qword ptr [eax]
3535 vpmovzxbd ymm2, qword ptr [eax + 8]
3536 lea eax, [eax + 16]
3537 vpslld ymm1, ymm1, 24
3538 vpslld ymm2, ymm2, 24
3539 vpblendvb ymm1, ymm1, [edx], ymm0
3540 vpblendvb ymm2, ymm2, [edx + 32], ymm0
3541 vmovdqu [edx], ymm1
3542 vmovdqu [edx + 32], ymm2
3543 lea edx, [edx + 64]
3544 sub ecx, 16
3545 jg convertloop
3546
3547 vzeroupper
3548 ret
3549 }
3550 }
3551 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3552
3553 #ifdef HAS_SETROW_X86
3554 // Write 'count' bytes using an 8 bit value repeated.
3555 // Count should be a multiple of 4.
3556 __declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) {
3557 __asm {
3558 movzx eax, byte ptr [esp + 8] // v8
3559 mov edx, 0x01010101 // Duplicate byte to all bytes.
3560 mul edx // overwrites edx with upper part of result.
3561 mov edx, edi
3562 mov edi, [esp + 4] // dst
3563 mov ecx, [esp + 12] // count
3564 shr ecx, 2
3565 rep stosd
3566 mov edi, edx
3567 ret
3568 }
3569 }
3570
3571 // Write 'count' bytes using an 8 bit value repeated.
3572 __declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
3573 __asm {
3574 mov edx, edi
3575 mov edi, [esp + 4] // dst
3576 mov eax, [esp + 8] // v8
3577 mov ecx, [esp + 12] // count
3578 rep stosb
3579 mov edi, edx
3580 ret
3581 }
3582 }
3583
3584 // Write 'count' 32 bit values.
3585 __declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
3586 __asm {
3587 mov edx, edi
3588 mov edi, [esp + 4] // dst
3589 mov eax, [esp + 8] // v32
3590 mov ecx, [esp + 12] // count
3591 rep stosd
3592 mov edi, edx
3593 ret
3594 }
3595 }
3596 #endif // HAS_SETROW_X86
3597
3598 #ifdef HAS_YUY2TOYROW_AVX2
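// YUY2 layout is Y0 U0 Y1 V0 (2 pixels per 4 bytes); masking the even bytes
// with 0x00ff00ff extracts Y.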
3599 __declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2,
3600 uint8* dst_y,
3601 int width) {
3602 __asm {
3603 mov eax, [esp + 4] // src_yuy2
3604 mov edx, [esp + 8] // dst_y
3605 mov ecx, [esp + 12] // width
3606 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3607 vpsrlw ymm5, ymm5, 8
3608
3609 convertloop:
3610 vmovdqu ymm0, [eax]
3611 vmovdqu ymm1, [eax + 32]
3612 lea eax, [eax + 64]
3613 vpand ymm0, ymm0, ymm5 // even bytes are Y
3614 vpand ymm1, ymm1, ymm5
3615 vpackuswb ymm0, ymm0, ymm1 // mutates.
3616 vpermq ymm0, ymm0, 0xd8
3617 vmovdqu [edx], ymm0
3618 lea edx, [edx + 32]
3619 sub ecx, 32
3620 jg convertloop
3621 vzeroupper
3622 ret
3623 }
3624 }
3625
3626 __declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
3627 int stride_yuy2,
3628 uint8* dst_u,
3629 uint8* dst_v,
3630 int width) {
3631 __asm {
3632 push esi
3633 push edi
3634 mov eax, [esp + 8 + 4] // src_yuy2
3635 mov esi, [esp + 8 + 8] // stride_yuy2
3636 mov edx, [esp + 8 + 12] // dst_u
3637 mov edi, [esp + 8 + 16] // dst_v
3638 mov ecx, [esp + 8 + 20] // width
3639 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3640 vpsrlw ymm5, ymm5, 8
3641 sub edi, edx
3642
3643 convertloop:
3644 vmovdqu ymm0, [eax]
3645 vmovdqu ymm1, [eax + 32]
3646 vpavgb ymm0, ymm0, [eax + esi]
3647 vpavgb ymm1, ymm1, [eax + esi + 32]
3648 lea eax, [eax + 64]
3649 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3650 vpsrlw ymm1, ymm1, 8
3651 vpackuswb ymm0, ymm0, ymm1 // mutates.
3652 vpermq ymm0, ymm0, 0xd8
3653 vpand ymm1, ymm0, ymm5 // U
3654 vpsrlw ymm0, ymm0, 8 // V
3655 vpackuswb ymm1, ymm1, ymm1 // mutates.
3656 vpackuswb ymm0, ymm0, ymm0 // mutates.
3657 vpermq ymm1, ymm1, 0xd8
3658 vpermq ymm0, ymm0, 0xd8
3659 vextractf128 [edx], ymm1, 0 // U
3660 vextractf128 [edx + edi], ymm0, 0 // V
3661 lea edx, [edx + 16]
3662 sub ecx, 32
3663 jg convertloop
3664
3665 pop edi
3666 pop esi
3667 vzeroupper
3668 ret
3669 }
3670 }
3671
3672 __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3673 uint8* dst_u,
3674 uint8* dst_v,
3675 int width) {
3676 __asm {
3677 push edi
3678 mov eax, [esp + 4 + 4] // src_yuy2
3679 mov edx, [esp + 4 + 8] // dst_u
3680 mov edi, [esp + 4 + 12] // dst_v
3681 mov ecx, [esp + 4 + 16] // width
3682 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3683 vpsrlw ymm5, ymm5, 8
3684 sub edi, edx
3685
3686 convertloop:
3687 vmovdqu ymm0, [eax]
3688 vmovdqu ymm1, [eax + 32]
3689 lea eax, [eax + 64]
3690 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
3691 vpsrlw ymm1, ymm1, 8
3692 vpackuswb ymm0, ymm0, ymm1 // mutates.
3693 vpermq ymm0, ymm0, 0xd8
3694 vpand ymm1, ymm0, ymm5 // U
3695 vpsrlw ymm0, ymm0, 8 // V
3696 vpackuswb ymm1, ymm1, ymm1 // mutates.
3697 vpackuswb ymm0, ymm0, ymm0 // mutates.
3698 vpermq ymm1, ymm1, 0xd8
3699 vpermq ymm0, ymm0, 0xd8
3700 vextractf128 [edx], ymm1, 0 // U
3701 vextractf128 [edx + edi], ymm0, 0 // V
3702 lea edx, [edx + 16]
3703 sub ecx, 32
3704 jg convertloop
3705
3706 pop edi
3707 vzeroupper
3708 ret
3709 }
3710 }
3711
3712 __declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy,
3713 uint8* dst_y,
3714 int width) {
3715 __asm {
3716 mov eax, [esp + 4] // src_uyvy
3717 mov edx, [esp + 8] // dst_y
3718 mov ecx, [esp + 12] // width
3719
3720 convertloop:
3721 vmovdqu ymm0, [eax]
3722 vmovdqu ymm1, [eax + 32]
3723 lea eax, [eax + 64]
3724 vpsrlw ymm0, ymm0, 8 // odd bytes are Y
3725 vpsrlw ymm1, ymm1, 8
3726 vpackuswb ymm0, ymm0, ymm1 // mutates.
3727 vpermq ymm0, ymm0, 0xd8
3728 vmovdqu [edx], ymm0
3729 lea edx, [edx + 32]
3730 sub ecx, 32
3731 jg convertloop
3732 vzeroupper
3733 ret
3734 }
3735 }
3736
3737 __declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy,
3738 int stride_uyvy,
3739 uint8* dst_u,
3740 uint8* dst_v,
3741 int width) {
3742 __asm {
3743 push esi
3744 push edi
3745 mov eax, [esp + 8 + 4] // src_uyvy
3746 mov esi, [esp + 8 + 8] // stride_uyvy
3747 mov edx, [esp + 8 + 12] // dst_u
3748 mov edi, [esp + 8 + 16] // dst_v
3749 mov ecx, [esp + 8 + 20] // width
3750 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3751 vpsrlw ymm5, ymm5, 8
3752 sub edi, edx
3753
3754 convertloop:
3755 vmovdqu ymm0, [eax]
3756 vmovdqu ymm1, [eax + 32]
3757 vpavgb ymm0, ymm0, [eax + esi]
3758 vpavgb ymm1, ymm1, [eax + esi + 32]
3759 lea eax, [eax + 64]
3760 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3761 vpand ymm1, ymm1, ymm5
3762 vpackuswb ymm0, ymm0, ymm1 // mutates.
3763 vpermq ymm0, ymm0, 0xd8
3764 vpand ymm1, ymm0, ymm5 // U
3765 vpsrlw ymm0, ymm0, 8 // V
3766 vpackuswb ymm1, ymm1, ymm1 // mutates.
3767 vpackuswb ymm0, ymm0, ymm0 // mutates.
3768 vpermq ymm1, ymm1, 0xd8
3769 vpermq ymm0, ymm0, 0xd8
3770 vextractf128 [edx], ymm1, 0 // U
3771 vextractf128 [edx + edi], ymm0, 0 // V
3772 lea edx, [edx + 16]
3773 sub ecx, 32
3774 jg convertloop
3775
3776 pop edi
3777 pop esi
3778 vzeroupper
3779 ret
3780 }
3781 }
3782
3783 __declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3784 uint8* dst_u,
3785 uint8* dst_v,
3786 int width) {
3787 __asm {
3788 push edi
3789 mov eax, [esp + 4 + 4] // src_uyvy
3790 mov edx, [esp + 4 + 8] // dst_u
3791 mov edi, [esp + 4 + 12] // dst_v
3792 mov ecx, [esp + 4 + 16] // width
3793 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3794 vpsrlw ymm5, ymm5, 8
3795 sub edi, edx
3796
3797 convertloop:
3798 vmovdqu ymm0, [eax]
3799 vmovdqu ymm1, [eax + 32]
3800 lea eax, [eax + 64]
3801 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3802 vpand ymm1, ymm1, ymm5
3803 vpackuswb ymm0, ymm0, ymm1 // mutates.
3804 vpermq ymm0, ymm0, 0xd8
3805 vpand ymm1, ymm0, ymm5 // U
3806 vpsrlw ymm0, ymm0, 8 // V
3807 vpackuswb ymm1, ymm1, ymm1 // mutates.
3808 vpackuswb ymm0, ymm0, ymm0 // mutates.
3809 vpermq ymm1, ymm1, 0xd8
3810 vpermq ymm0, ymm0, 0xd8
3811 vextractf128 [edx], ymm1, 0 // U
3812 vextractf128 [edx + edi], ymm0, 0 // V
3813 lea edx, [edx + 16]
3814 sub ecx, 32
3815 jg convertloop
3816
3817 pop edi
3818 vzeroupper
3819 ret
3820 }
3821 }
3822 #endif // HAS_YUY2TOYROW_AVX2
3823
3824 #ifdef HAS_YUY2TOYROW_SSE2
3825 __declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2,
3826 uint8* dst_y,
3827 int width) {
3828 __asm {
3829 mov eax, [esp + 4] // src_yuy2
3830 mov edx, [esp + 8] // dst_y
3831 mov ecx, [esp + 12] // width
3832 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3833 psrlw xmm5, 8
3834
3835 convertloop:
3836 movdqu xmm0, [eax]
3837 movdqu xmm1, [eax + 16]
3838 lea eax, [eax + 32]
3839 pand xmm0, xmm5 // even bytes are Y
3840 pand xmm1, xmm5
3841 packuswb xmm0, xmm1
3842 movdqu [edx], xmm0
3843 lea edx, [edx + 16]
3844 sub ecx, 16
3845 jg convertloop
3846 ret
3847 }
3848 }
3849
3850 __declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
3851 int stride_yuy2,
3852 uint8* dst_u,
3853 uint8* dst_v,
3854 int width) {
3855 __asm {
3856 push esi
3857 push edi
3858 mov eax, [esp + 8 + 4] // src_yuy2
3859 mov esi, [esp + 8 + 8] // stride_yuy2
3860 mov edx, [esp + 8 + 12] // dst_u
3861 mov edi, [esp + 8 + 16] // dst_v
3862 mov ecx, [esp + 8 + 20] // width
3863 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3864 psrlw xmm5, 8
3865 sub edi, edx
3866
3867 convertloop:
3868 movdqu xmm0, [eax]
3869 movdqu xmm1, [eax + 16]
3870 movdqu xmm2, [eax + esi]
3871 movdqu xmm3, [eax + esi + 16]
3872 lea eax, [eax + 32]
3873 pavgb xmm0, xmm2
3874 pavgb xmm1, xmm3
3875 psrlw xmm0, 8 // YUYV -> UVUV
3876 psrlw xmm1, 8
3877 packuswb xmm0, xmm1
3878 movdqa xmm1, xmm0
3879 pand xmm0, xmm5 // U
3880 packuswb xmm0, xmm0
3881 psrlw xmm1, 8 // V
3882 packuswb xmm1, xmm1
3883 movq qword ptr [edx], xmm0
3884 movq qword ptr [edx + edi], xmm1
3885 lea edx, [edx + 8]
3886 sub ecx, 16
3887 jg convertloop
3888
3889 pop edi
3890 pop esi
3891 ret
3892 }
3893 }
3894
3895 __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3896 uint8* dst_u,
3897 uint8* dst_v,
3898 int width) {
3899 __asm {
3900 push edi
3901 mov eax, [esp + 4 + 4] // src_yuy2
3902 mov edx, [esp + 4 + 8] // dst_u
3903 mov edi, [esp + 4 + 12] // dst_v
3904 mov ecx, [esp + 4 + 16] // width
3905 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3906 psrlw xmm5, 8
3907 sub edi, edx
3908
3909 convertloop:
3910 movdqu xmm0, [eax]
3911 movdqu xmm1, [eax + 16]
3912 lea eax, [eax + 32]
3913 psrlw xmm0, 8 // YUYV -> UVUV
3914 psrlw xmm1, 8
3915 packuswb xmm0, xmm1
3916 movdqa xmm1, xmm0
3917 pand xmm0, xmm5 // U
3918 packuswb xmm0, xmm0
3919 psrlw xmm1, 8 // V
3920 packuswb xmm1, xmm1
3921 movq qword ptr [edx], xmm0
3922 movq qword ptr [edx + edi], xmm1
3923 lea edx, [edx + 8]
3924 sub ecx, 16
3925 jg convertloop
3926
3927 pop edi
3928 ret
3929 }
3930 }
3931
3932 __declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy,
3933 uint8* dst_y,
3934 int width) {
3935 __asm {
3936 mov eax, [esp + 4] // src_uyvy
3937 mov edx, [esp + 8] // dst_y
3938 mov ecx, [esp + 12] // width
3939
3940 convertloop:
3941 movdqu xmm0, [eax]
3942 movdqu xmm1, [eax + 16]
3943 lea eax, [eax + 32]
3944 psrlw xmm0, 8 // odd bytes are Y
3945 psrlw xmm1, 8
3946 packuswb xmm0, xmm1
3947 movdqu [edx], xmm0
3948 lea edx, [edx + 16]
3949 sub ecx, 16
3950 jg convertloop
3951 ret
3952 }
3953 }
3954
3955 __declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy,
3956 int stride_uyvy,
3957 uint8* dst_u,
3958 uint8* dst_v,
3959 int width) {
3960 __asm {
3961 push esi
3962 push edi
3963 mov eax, [esp + 8 + 4] // src_uyvy
3964 mov esi, [esp + 8 + 8] // stride_uyvy
3965 mov edx, [esp + 8 + 12] // dst_u
3966 mov edi, [esp + 8 + 16] // dst_v
3967 mov ecx, [esp + 8 + 20] // width
3968 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
3969 psrlw xmm5, 8
3970 sub edi, edx
3971
3972 convertloop:
3973 movdqu xmm0, [eax]
3974 movdqu xmm1, [eax + 16]
3975 movdqu xmm2, [eax + esi]
3976 movdqu xmm3, [eax + esi + 16]
3977 lea eax, [eax + 32]
3978 pavgb xmm0, xmm2
3979 pavgb xmm1, xmm3
3980 pand xmm0, xmm5 // UYVY -> UVUV
3981 pand xmm1, xmm5
3982 packuswb xmm0, xmm1
3983 movdqa xmm1, xmm0
3984 pand xmm0, xmm5 // U
3985 packuswb xmm0, xmm0
3986 psrlw xmm1, 8 // V
3987 packuswb xmm1, xmm1
3988 movq qword ptr [edx], xmm0
3989 movq qword ptr [edx + edi], xmm1
3990 lea edx, [edx + 8]
3991 sub ecx, 16
3992 jg convertloop
3993
3994 pop edi
3995 pop esi
3996 ret
3997 }
3998 }
3999
4000 __declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
4001 uint8* dst_u,
4002 uint8* dst_v,
4003 int width) {
4004 __asm {
4005 push edi
4006 mov eax, [esp + 4 + 4] // src_uyvy
4007 mov edx, [esp + 4 + 8] // dst_u
4008 mov edi, [esp + 4 + 12] // dst_v
4009 mov ecx, [esp + 4 + 16] // width
4010 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4011 psrlw xmm5, 8
4012 sub edi, edx
4013
4014 convertloop:
4015 movdqu xmm0, [eax]
4016 movdqu xmm1, [eax + 16]
4017 lea eax, [eax + 32]
4018 pand xmm0, xmm5 // UYVY -> UVUV
4019 pand xmm1, xmm5
4020 packuswb xmm0, xmm1
4021 movdqa xmm1, xmm0
4022 pand xmm0, xmm5 // U
4023 packuswb xmm0, xmm0
4024 psrlw xmm1, 8 // V
4025 packuswb xmm1, xmm1
4026 movq qword ptr [edx], xmm0
4027 movq qword ptr [edx + edi], xmm1
4028 lea edx, [edx + 8]
4029 sub ecx, 16
4030 jg convertloop
4031
4032 pop edi
4033 ret
4034 }
4035 }
4036 #endif // HAS_YUY2TOYROW_SSE2
4037
4038 #ifdef HAS_BLENDPLANEROW_SSSE3
4039 // Blend 8 pixels at a time.
4040 // unsigned version of math
4041 // =((A2*C2)+(B2*(255-C2))+255)/256
4042 // signed version of math
4043 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
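// pmaddubsw treats its first operand as unsigned bytes and its second as
// signed bytes, so the (a, 255-a) pairs stay unsigned while src0/src1 are
// biased by -128; adding 0x807f afterwards removes the bias and rounds.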
4044 __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
4045 const uint8* src1,
4046 const uint8* alpha,
4047 uint8* dst,
4048 int width) {
4049 __asm {
4050 push esi
4051 push edi
4052 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4053 psllw xmm5, 8
4054 mov eax, 0x80808080 // 128 for biasing image to signed.
4055 movd xmm6, eax
4056 pshufd xmm6, xmm6, 0x00
4057
4058 mov eax, 0x807f807f // 32768 + 127 for unbias and round.
4059 movd xmm7, eax
4060 pshufd xmm7, xmm7, 0x00
4061 mov eax, [esp + 8 + 4] // src0
4062 mov edx, [esp + 8 + 8] // src1
4063 mov esi, [esp + 8 + 12] // alpha
4064 mov edi, [esp + 8 + 16] // dst
4065 mov ecx, [esp + 8 + 20] // width
4066 sub eax, esi
4067 sub edx, esi
4068 sub edi, esi
4069
4070 // 8 pixel loop.
4071 convertloop8:
4072 movq xmm0, qword ptr [esi] // alpha
4073 punpcklbw xmm0, xmm0
4074 pxor xmm0, xmm5 // a, 255-a
4075 movq xmm1, qword ptr [eax + esi] // src0
4076 movq xmm2, qword ptr [edx + esi] // src1
4077 punpcklbw xmm1, xmm2
4078 psubb xmm1, xmm6 // bias src0/1 - 128
4079 pmaddubsw xmm0, xmm1
4080 paddw xmm0, xmm7 // unbias result - 32768 and round.
4081 psrlw xmm0, 8
4082 packuswb xmm0, xmm0
4083 movq qword ptr [edi + esi], xmm0
4084 lea esi, [esi + 8]
4085 sub ecx, 8
4086 jg convertloop8
4087
4088 pop edi
4089 pop esi
4090 ret
4091 }
4092 }
4093 #endif // HAS_BLENDPLANEROW_SSSE3
4094
4095 #ifdef HAS_BLENDPLANEROW_AVX2
4096 // Blend 32 pixels at a time.
4097 // unsigned version of math
4098 // =((A2*C2)+(B2*(255-C2))+255)/256
4099 // signed version of math
4100 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
4101 __declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
4102 const uint8* src1,
4103 const uint8* alpha,
4104 uint8* dst,
4105 int width) {
4106 __asm {
4107 push esi
4108 push edi
4109 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00
4110 vpsllw ymm5, ymm5, 8
4111 mov eax, 0x80808080 // 128 for biasing image to signed.
4112 vmovd xmm6, eax
4113 vbroadcastss ymm6, xmm6
4114 mov eax, 0x807f807f // 32768 + 127 for unbias and round.
4115 vmovd xmm7, eax
4116 vbroadcastss ymm7, xmm7
4117 mov eax, [esp + 8 + 4] // src0
4118 mov edx, [esp + 8 + 8] // src1
4119 mov esi, [esp + 8 + 12] // alpha
4120 mov edi, [esp + 8 + 16] // dst
4121 mov ecx, [esp + 8 + 20] // width
4122 sub eax, esi
4123 sub edx, esi
4124 sub edi, esi
4125
4126 // 32 pixel loop.
4127 convertloop32:
4128 vmovdqu ymm0, [esi] // alpha
4129 vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
4130 vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23
4131 vpxor ymm3, ymm3, ymm5 // a, 255-a
4132 vpxor ymm0, ymm0, ymm5 // a, 255-a
4133 vmovdqu ymm1, [eax + esi] // src0
4134 vmovdqu ymm2, [edx + esi] // src1
4135 vpunpckhbw ymm4, ymm1, ymm2
4136 vpunpcklbw ymm1, ymm1, ymm2
4137 vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128
4138 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128
4139 vpmaddubsw ymm3, ymm3, ymm4
4140 vpmaddubsw ymm0, ymm0, ymm1
4141 vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round.
4142 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round.
4143 vpsrlw ymm3, ymm3, 8
4144 vpsrlw ymm0, ymm0, 8
4145 vpackuswb ymm0, ymm0, ymm3
4146 vmovdqu [edi + esi], ymm0
4147 lea esi, [esi + 32]
4148 sub ecx, 32
4149 jg convertloop32
4150
4151 pop edi
4152 pop esi
4153 vzeroupper
4154 ret
4155 }
4156 }
4157 #endif // HAS_BLENDPLANEROW_AVX2
4158
4159 #ifdef HAS_ARGBBLENDROW_SSSE3
4160 // Shuffle table for isolating alpha.
4161 static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4162 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
4163
4164 // Blend 8 pixels at a time.
4165 __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
4166 const uint8* src_argb1,
4167 uint8* dst_argb,
4168 int width) {
4169 __asm {
4170 push esi
4171 mov eax, [esp + 4 + 4] // src_argb0
4172 mov esi, [esp + 4 + 8] // src_argb1
4173 mov edx, [esp + 4 + 12] // dst_argb
4174 mov ecx, [esp + 4 + 16] // width
4175 pcmpeqb xmm7, xmm7 // generate constant 0x0001
4176 psrlw xmm7, 15
4177 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4178 psrlw xmm6, 8
4179 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4180 psllw xmm5, 8
4181 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4182 pslld xmm4, 24
4183 sub ecx, 4
4184 jl convertloop4b // less than 4 pixels?
4185
4186 // 4 pixel loop.
4187 convertloop4:
4188 movdqu xmm3, [eax] // src argb
4189 lea eax, [eax + 16]
4190 movdqa xmm0, xmm3 // src argb
4191 pxor xmm3, xmm4 // ~alpha
4192 movdqu xmm2, [esi] // _r_b
4193 pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
4194 pand xmm2, xmm6 // _r_b
4195 paddw xmm3, xmm7 // 256 - alpha
4196 pmullw xmm2, xmm3 // _r_b * alpha
4197 movdqu xmm1, [esi] // _a_g
4198 lea esi, [esi + 16]
4199 psrlw xmm1, 8 // _a_g
4200 por xmm0, xmm4 // set alpha to 255
4201 pmullw xmm1, xmm3 // _a_g * alpha
4202 psrlw xmm2, 8 // _r_b convert to 8 bits again
4203 paddusb xmm0, xmm2 // + src argb
4204 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4205 paddusb xmm0, xmm1 // + src argb
4206 movdqu [edx], xmm0
4207 lea edx, [edx + 16]
4208 sub ecx, 4
4209 jge convertloop4
4210
4211 convertloop4b:
4212 add ecx, 4 - 1
4213 jl convertloop1b
4214
4215 // 1 pixel loop.
4216 convertloop1:
4217 movd xmm3, [eax] // src argb
4218 lea eax, [eax + 4]
4219 movdqa xmm0, xmm3 // src argb
4220 pxor xmm3, xmm4 // ~alpha
4221 movd xmm2, [esi] // _r_b
4222 pshufb xmm3, xmmword ptr kShuffleAlpha // alpha
4223 pand xmm2, xmm6 // _r_b
4224 paddw xmm3, xmm7 // 256 - alpha
4225 pmullw xmm2, xmm3 // _r_b * alpha
4226 movd xmm1, [esi] // _a_g
4227 lea esi, [esi + 4]
4228 psrlw xmm1, 8 // _a_g
4229 por xmm0, xmm4 // set alpha to 255
4230 pmullw xmm1, xmm3 // _a_g * alpha
4231 psrlw xmm2, 8 // _r_b convert to 8 bits again
4232 paddusb xmm0, xmm2 // + src argb
4233 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4234 paddusb xmm0, xmm1 // + src argb
4235 movd [edx], xmm0
4236 lea edx, [edx + 4]
4237 sub ecx, 1
4238 jge convertloop1
4239
4240 convertloop1b:
4241 pop esi
4242 ret
4243 }
4244 }
4245 #endif // HAS_ARGBBLENDROW_SSSE3
4246
4247 #ifdef HAS_ARGBATTENUATEROW_SSSE3
4248 // Shuffle table duplicating alpha.
4249 static const uvec8 kShuffleAlpha0 = {
4250 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
4251 };
4252 static const uvec8 kShuffleAlpha1 = {
4253 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
4254 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
4255 };
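// Attenuation multiplies each color channel by its alpha. A scalar sketch of
// the math the shuffles below set up (the duplicated bytes supply the * 0x101
// factors):
//   b_out = ((b * 0x101) * (a * 0x101)) >> 24;  // approximately b * a / 255
// with the original alpha byte copied through unchanged.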
4256 __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb,
4257 uint8* dst_argb,
4258 int width) {
4259 __asm {
4260 mov eax, [esp + 4] // src_argb0
4261 mov edx, [esp + 8] // dst_argb
4262 mov ecx, [esp + 12] // width
4263 pcmpeqb xmm3, xmm3 // generate mask 0xff000000
4264 pslld xmm3, 24
4265 movdqa xmm4, xmmword ptr kShuffleAlpha0
4266 movdqa xmm5, xmmword ptr kShuffleAlpha1
4267
4268 convertloop:
4269 movdqu xmm0, [eax] // read 4 pixels
4270 pshufb xmm0, xmm4 // isolate first 2 alphas
4271 movdqu xmm1, [eax] // read 4 pixels
4272 punpcklbw xmm1, xmm1 // first 2 pixel rgbs
4273 pmulhuw xmm0, xmm1 // rgb * a
4274 movdqu xmm1, [eax] // read 4 pixels
4275 pshufb xmm1, xmm5 // isolate next 2 alphas
4276 movdqu xmm2, [eax] // read 4 pixels
4277 punpckhbw xmm2, xmm2 // next 2 pixel rgbs
4278 pmulhuw xmm1, xmm2 // rgb * a
4279 movdqu xmm2, [eax] // mask original alpha
4280 lea eax, [eax + 16]
4281 pand xmm2, xmm3
4282 psrlw xmm0, 8
4283 psrlw xmm1, 8
4284 packuswb xmm0, xmm1
4285 por xmm0, xmm2 // copy original alpha
4286 movdqu [edx], xmm0
4287 lea edx, [edx + 16]
4288 sub ecx, 4
4289 jg convertloop
4290
4291 ret
4292 }
4293 }
4294 #endif // HAS_ARGBATTENUATEROW_SSSE3
4295
4296 #ifdef HAS_ARGBATTENUATEROW_AVX2
4297 // Shuffle table duplicating alpha.
4298 static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
4299 128u, 128u, 14u, 15u, 14u, 15u,
4300 14u, 15u, 128u, 128u};
4301 __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb,
4302 uint8* dst_argb,
4303 int width) {
4304 __asm {
4305 mov eax, [esp + 4] // src_argb0
4306 mov edx, [esp + 8] // dst_argb
4307 mov ecx, [esp + 12] // width
4308 sub edx, eax
4309 vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
4310 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
4311 vpslld ymm5, ymm5, 24
4312
4313 convertloop:
4314 vmovdqu ymm6, [eax] // read 8 pixels.
4315 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4316 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4317 vpshufb ymm2, ymm0, ymm4 // low 4 alphas
4318 vpshufb ymm3, ymm1, ymm4 // high 4 alphas
4319 vpmulhuw ymm0, ymm0, ymm2 // rgb * a
4320 vpmulhuw ymm1, ymm1, ymm3 // rgb * a
4321 vpand ymm6, ymm6, ymm5 // isolate alpha
4322 vpsrlw ymm0, ymm0, 8
4323 vpsrlw ymm1, ymm1, 8
4324 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4325 vpor ymm0, ymm0, ymm6 // copy original alpha
4326 vmovdqu [eax + edx], ymm0
4327 lea eax, [eax + 32]
4328 sub ecx, 8
4329 jg convertloop
4330
4331 vzeroupper
4332 ret
4333 }
4334 }
4335 #endif // HAS_ARGBATTENUATEROW_AVX2
4336
4337 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4338 // Unattenuate 4 pixels at a time.
4339 __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
4340 uint8* dst_argb,
4341 int width) {
4342 __asm {
4343 push ebx
4344 push esi
4345 push edi
4346 mov eax, [esp + 12 + 4] // src_argb
4347 mov edx, [esp + 12 + 8] // dst_argb
4348 mov ecx, [esp + 12 + 12] // width
4349 lea ebx, fixed_invtbl8
4350
4351 convertloop:
4352 movdqu xmm0, [eax] // read 4 pixels
4353 movzx esi, byte ptr [eax + 3] // first alpha
4354 movzx edi, byte ptr [eax + 7] // second alpha
4355 punpcklbw xmm0, xmm0 // first 2
4356 movd xmm2, dword ptr [ebx + esi * 4]
4357 movd xmm3, dword ptr [ebx + edi * 4]
4358 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
4359 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4360 movlhps xmm2, xmm3
4361 pmulhuw xmm0, xmm2 // rgb * a
4362
4363 movdqu xmm1, [eax] // read 4 pixels
4364 movzx esi, byte ptr [eax + 11] // third alpha
4365 movzx edi, byte ptr [eax + 15] // fourth alpha
4366 punpckhbw xmm1, xmm1 // next 2
4367 movd xmm2, dword ptr [ebx + esi * 4]
4368 movd xmm3, dword ptr [ebx + edi * 4]
4369 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
4370 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4371 movlhps xmm2, xmm3
4372 pmulhuw xmm1, xmm2 // rgb * a
4373 lea eax, [eax + 16]
4374 packuswb xmm0, xmm1
4375 movdqu [edx], xmm0
4376 lea edx, [edx + 16]
4377 sub ecx, 4
4378 jg convertloop
4379
4380 pop edi
4381 pop esi
4382 pop ebx
4383 ret
4384 }
4385 }
4386 #endif // HAS_ARGBUNATTENUATEROW_SSE2
4387
4388 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
4389 // Shuffle table duplicating alpha.
4390 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
4391 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
4392 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4393 // USE_GATHER is not on by default, due to being a slow instruction.
4394 #ifdef USE_GATHER
4395 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
4396 uint8* dst_argb,
4397 int width) {
4398 __asm {
4399 mov eax, [esp + 4] // src_argb0
4400 mov edx, [esp + 8] // dst_argb
4401 mov ecx, [esp + 12] // width
4402 sub edx, eax
4403 vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
4404
4405 convertloop:
4406 vmovdqu ymm6, [eax] // read 8 pixels.
4407 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
4408 vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
4409 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4410 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4411 vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
4412 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
4413 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
4414 vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
4415 vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
4416 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
4417 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
4418 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4419 vmovdqu [eax + edx], ymm0
4420 lea eax, [eax + 32]
4421 sub ecx, 8
4422 jg convertloop
4423
4424 vzeroupper
4425 ret
4426 }
4427 }
4428 #else // USE_GATHER
4429 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
4430 uint8* dst_argb,
4431 int width) {
4432 __asm {
4433
4434 push ebx
4435 push esi
4436 push edi
4437 mov eax, [esp + 12 + 4] // src_argb
4438 mov edx, [esp + 12 + 8] // dst_argb
4439 mov ecx, [esp + 12 + 12] // width
4440 sub edx, eax
4441 lea ebx, fixed_invtbl8
4442 vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
4443
4444 convertloop:
4445 // replace VPGATHER
4446 movzx esi, byte ptr [eax + 3] // alpha0
4447 movzx edi, byte ptr [eax + 7] // alpha1
4448 vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0]
4449 vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1]
4450 movzx esi, byte ptr [eax + 11] // alpha2
4451 movzx edi, byte ptr [eax + 15] // alpha3
4452 vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
4453 vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2]
4454 vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3]
4455 movzx esi, byte ptr [eax + 19] // alpha4
4456 movzx edi, byte ptr [eax + 23] // alpha5
4457 vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
4458 vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4]
4459 vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5]
4460 movzx esi, byte ptr [eax + 27] // alpha6
4461 movzx edi, byte ptr [eax + 31] // alpha7
4462 vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
4463 vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6]
4464 vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7]
4465 vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
4466 vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
4467 vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
4468 vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4469 // end of VPGATHER
4470
4471 vmovdqu ymm6, [eax] // read 8 pixels.
4472 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4473 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4474 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
4475 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
4476 vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
4477 vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
4478 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
4479 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
4480 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4481 vmovdqu [eax + edx], ymm0
4482 lea eax, [eax + 32]
4483 sub ecx, 8
4484 jg convertloop
4485
4486 pop edi
4487 pop esi
4488 pop ebx
4489 vzeroupper
4490 ret
4491 }
4492 }
4493 #endif // USE_GATHER
4494 #endif // HAS_ARGBUNATTENUATEROW_AVX2
4495
4496 #ifdef HAS_ARGBGRAYROW_SSSE3
4497 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
4498 __declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb,
4499 uint8* dst_argb,
4500 int width) {
4501 __asm {
4502 mov eax, [esp + 4] /* src_argb */
4503 mov edx, [esp + 8] /* dst_argb */
4504 mov ecx, [esp + 12] /* width */
4505 movdqa xmm4, xmmword ptr kARGBToYJ
4506 movdqa xmm5, xmmword ptr kAddYJ64
4507
4508 convertloop:
4509 movdqu xmm0, [eax] // G
4510 movdqu xmm1, [eax + 16]
4511 pmaddubsw xmm0, xmm4
4512 pmaddubsw xmm1, xmm4
4513 phaddw xmm0, xmm1
4514 paddw xmm0, xmm5 // Add .5 for rounding.
4515 psrlw xmm0, 7
4516 packuswb xmm0, xmm0 // 8 G bytes
4517 movdqu xmm2, [eax] // A
4518 movdqu xmm3, [eax + 16]
4519 lea eax, [eax + 32]
4520 psrld xmm2, 24
4521 psrld xmm3, 24
4522 packuswb xmm2, xmm3
4523 packuswb xmm2, xmm2 // 8 A bytes
4524 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
4525 punpcklbw xmm0, xmm0 // 8 GG words
4526 punpcklbw xmm3, xmm2 // 8 GA words
4527 movdqa xmm1, xmm0
4528 punpcklwd xmm0, xmm3 // GGGA first 4
4529 punpckhwd xmm1, xmm3 // GGGA next 4
4530 movdqu [edx], xmm0
4531 movdqu [edx + 16], xmm1
4532 lea edx, [edx + 32]
4533 sub ecx, 8
4534 jg convertloop
4535 ret
4536 }
4537 }
4538 #endif // HAS_ARGBGRAYROW_SSSE3
4539
4540 #ifdef HAS_ARGBSEPIAROW_SSSE3
4541 // b = (r * 35 + g * 68 + b * 17) >> 7
4542 // g = (r * 45 + g * 88 + b * 22) >> 7
4543 // r = (r * 50 + g * 98 + b * 24) >> 7
4544 // Constant for ARGB color to sepia tone.
4545 static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
4546 17, 68, 35, 0, 17, 68, 35, 0};
4547
4548 static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
4549 22, 88, 45, 0, 22, 88, 45, 0};
4550
4551 static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
4552 24, 98, 50, 0, 24, 98, 50, 0};
4553
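// Worked example of the formulas above for a mid-gray input (r = g = b = 128):
//   b = (128*35 + 128*68 + 128*17) >> 7 = (128*120) >> 7 = 120
//   g = (128*45 + 128*88 + 128*22) >> 7 = (128*155) >> 7 = 155
//   r = (128*50 + 128*98 + 128*24) >> 7 = (128*172) >> 7 = 172
// giving the warm brown cast; packuswb below saturates brighter inputs to 255.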
4554 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
4555 __declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
4556 __asm {
4557 mov eax, [esp + 4] /* dst_argb */
4558 mov ecx, [esp + 8] /* width */
4559 movdqa xmm2, xmmword ptr kARGBToSepiaB
4560 movdqa xmm3, xmmword ptr kARGBToSepiaG
4561 movdqa xmm4, xmmword ptr kARGBToSepiaR
4562
4563 convertloop:
4564 movdqu xmm0, [eax] // B
4565 movdqu xmm6, [eax + 16]
4566 pmaddubsw xmm0, xmm2
4567 pmaddubsw xmm6, xmm2
4568 phaddw xmm0, xmm6
4569 psrlw xmm0, 7
4570 packuswb xmm0, xmm0 // 8 B values
4571 movdqu xmm5, [eax] // G
4572 movdqu xmm1, [eax + 16]
4573 pmaddubsw xmm5, xmm3
4574 pmaddubsw xmm1, xmm3
4575 phaddw xmm5, xmm1
4576 psrlw xmm5, 7
4577 packuswb xmm5, xmm5 // 8 G values
4578 punpcklbw xmm0, xmm5 // 8 BG values
4579 movdqu xmm5, [eax] // R
4580 movdqu xmm1, [eax + 16]
4581 pmaddubsw xmm5, xmm4
4582 pmaddubsw xmm1, xmm4
4583 phaddw xmm5, xmm1
4584 psrlw xmm5, 7
4585 packuswb xmm5, xmm5 // 8 R values
4586 movdqu xmm6, [eax] // A
4587 movdqu xmm1, [eax + 16]
4588 psrld xmm6, 24
4589 psrld xmm1, 24
4590 packuswb xmm6, xmm1
4591 packuswb xmm6, xmm6 // 8 A values
4592 punpcklbw xmm5, xmm6 // 8 RA values
4593 movdqa xmm1, xmm0 // Weave BG, RA together
4594 punpcklwd xmm0, xmm5 // BGRA first 4
4595 punpckhwd xmm1, xmm5 // BGRA next 4
4596 movdqu [eax], xmm0
4597 movdqu [eax + 16], xmm1
4598 lea eax, [eax + 32]
4599 sub ecx, 8
4600 jg convertloop
4601 ret
4602 }
4603 }
4604 #endif // HAS_ARGBSEPIAROW_SSSE3
4605
4606 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4607 // Transform 8 ARGB pixels (32 bytes) with color matrix.
4608 // Same as Sepia except matrix is provided.
4609 // TODO(fbarchard): packuswb only uses half of the register. To make RGBA, combine R
4610 // and B into high and low halves, then G/A, then punpckl/hbw and punpckl/hwd.
4611 __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
4612 uint8* dst_argb,
4613 const int8* matrix_argb,
4614 int width) {
4615 __asm {
4616 mov eax, [esp + 4] /* src_argb */
4617 mov edx, [esp + 8] /* dst_argb */
4618 mov ecx, [esp + 12] /* matrix_argb */
4619 movdqu xmm5, [ecx]
4620 pshufd xmm2, xmm5, 0x00
4621 pshufd xmm3, xmm5, 0x55
4622 pshufd xmm4, xmm5, 0xaa
4623 pshufd xmm5, xmm5, 0xff
4624 mov ecx, [esp + 16] /* width */
4625
4626 convertloop:
4627 movdqu xmm0, [eax] // B
4628 movdqu xmm7, [eax + 16]
4629 pmaddubsw xmm0, xmm2
4630 pmaddubsw xmm7, xmm2
4631 movdqu xmm6, [eax] // G
4632 movdqu xmm1, [eax + 16]
4633 pmaddubsw xmm6, xmm3
4634 pmaddubsw xmm1, xmm3
4635 phaddsw xmm0, xmm7 // B
4636 phaddsw xmm6, xmm1 // G
4637 psraw xmm0, 6 // B
4638 psraw xmm6, 6 // G
4639 packuswb xmm0, xmm0 // 8 B values
4640 packuswb xmm6, xmm6 // 8 G values
4641 punpcklbw xmm0, xmm6 // 8 BG values
4642 movdqu xmm1, [eax] // R
4643 movdqu xmm7, [eax + 16]
4644 pmaddubsw xmm1, xmm4
4645 pmaddubsw xmm7, xmm4
4646 phaddsw xmm1, xmm7 // R
4647 movdqu xmm6, [eax] // A
4648 movdqu xmm7, [eax + 16]
4649 pmaddubsw xmm6, xmm5
4650 pmaddubsw xmm7, xmm5
4651 phaddsw xmm6, xmm7 // A
4652 psraw xmm1, 6 // R
4653 psraw xmm6, 6 // A
4654 packuswb xmm1, xmm1 // 8 R values
4655 packuswb xmm6, xmm6 // 8 A values
4656 punpcklbw xmm1, xmm6 // 8 RA values
4657 movdqa xmm6, xmm0 // Weave BG, RA together
4658 punpcklwd xmm0, xmm1 // BGRA first 4
4659 punpckhwd xmm6, xmm1 // BGRA next 4
4660 movdqu [edx], xmm0
4661 movdqu [edx + 16], xmm6
4662 lea eax, [eax + 32]
4663 lea edx, [edx + 32]
4664 sub ecx, 8
4665 jg convertloop
4666 ret
4667 }
4668 }
4669 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
4670
4671 #ifdef HAS_ARGBQUANTIZEROW_SSE2
4672 // Quantize 4 ARGB pixels (16 bytes).
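// Per channel this computes (a sketch; alpha is masked off and passed through):
//   dst = ((src * scale) >> 16) * interval_size + interval_offset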
4673 __declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb,
4674 int scale,
4675 int interval_size,
4676 int interval_offset,
4677 int width) {
4678 __asm {
4679 mov eax, [esp + 4] /* dst_argb */
4680 movd xmm2, [esp + 8] /* scale */
4681 movd xmm3, [esp + 12] /* interval_size */
4682 movd xmm4, [esp + 16] /* interval_offset */
4683 mov ecx, [esp + 20] /* width */
4684 pshuflw xmm2, xmm2, 040h
4685 pshufd xmm2, xmm2, 044h
4686 pshuflw xmm3, xmm3, 040h
4687 pshufd xmm3, xmm3, 044h
4688 pshuflw xmm4, xmm4, 040h
4689 pshufd xmm4, xmm4, 044h
4690 pxor xmm5, xmm5 // constant 0
4691 pcmpeqb xmm6, xmm6 // generate mask 0xff000000
4692 pslld xmm6, 24
4693
4694 convertloop:
4695 movdqu xmm0, [eax] // read 4 pixels
4696 punpcklbw xmm0, xmm5 // first 2 pixels
4697 pmulhuw xmm0, xmm2 // pixel * scale >> 16
4698 movdqu xmm1, [eax] // read 4 pixels
4699 punpckhbw xmm1, xmm5 // next 2 pixels
4700 pmulhuw xmm1, xmm2
4701 pmullw xmm0, xmm3 // * interval_size
4702 movdqu xmm7, [eax] // read 4 pixels
4703 pmullw xmm1, xmm3
4704 pand xmm7, xmm6 // mask alpha
4705 paddw xmm0, xmm4 // + interval_offset (typically interval_size / 2)
4706 paddw xmm1, xmm4
4707 packuswb xmm0, xmm1
4708 por xmm0, xmm7
4709 movdqu [eax], xmm0
4710 lea eax, [eax + 16]
4711 sub ecx, 4
4712 jg convertloop
4713 ret
4714 }
4715 }
4716 #endif // HAS_ARGBQUANTIZEROW_SSE2
4717
4718 #ifdef HAS_ARGBSHADEROW_SSE2
4719 // Shade 4 pixels at a time by specified value.
4720 __declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb,
4721 uint8* dst_argb,
4722 int width,
4723 uint32 value) {
4724 __asm {
4725 mov eax, [esp + 4] // src_argb
4726 mov edx, [esp + 8] // dst_argb
4727 mov ecx, [esp + 12] // width
4728 movd xmm2, [esp + 16] // value
4729 punpcklbw xmm2, xmm2
4730 punpcklqdq xmm2, xmm2
4731
4732 convertloop:
4733 movdqu xmm0, [eax] // read 4 pixels
4734 lea eax, [eax + 16]
4735 movdqa xmm1, xmm0
4736 punpcklbw xmm0, xmm0 // first 2
4737 punpckhbw xmm1, xmm1 // next 2
4738 pmulhuw xmm0, xmm2 // argb * value
4739 pmulhuw xmm1, xmm2 // argb * value
4740 psrlw xmm0, 8
4741 psrlw xmm1, 8
4742 packuswb xmm0, xmm1
4743 movdqu [edx], xmm0
4744 lea edx, [edx + 16]
4745 sub ecx, 4
4746 jg convertloop
4747
4748 ret
4749 }
4750 }
4751 #endif // HAS_ARGBSHADEROW_SSE2
4752
4753 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4754 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
4755 __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
4756 const uint8* src_argb1,
4757 uint8* dst_argb,
4758 int width) {
4759 __asm {
4760 push esi
4761 mov eax, [esp + 4 + 4] // src_argb0
4762 mov esi, [esp + 4 + 8] // src_argb1
4763 mov edx, [esp + 4 + 12] // dst_argb
4764 mov ecx, [esp + 4 + 16] // width
4765 pxor xmm5, xmm5 // constant 0
4766
4767 convertloop:
4768 movdqu xmm0, [eax] // read 4 pixels from src_argb0
4769 movdqu xmm2, [esi] // read 4 pixels from src_argb1
4770 movdqu xmm1, xmm0
4771 movdqu xmm3, xmm2
4772 punpcklbw xmm0, xmm0 // first 2
4773 punpckhbw xmm1, xmm1 // next 2
4774 punpcklbw xmm2, xmm5 // first 2
4775 punpckhbw xmm3, xmm5 // next 2
4776 pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
4777 pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
4778 lea eax, [eax + 16]
4779 lea esi, [esi + 16]
4780 packuswb xmm0, xmm1
4781 movdqu [edx], xmm0
4782 lea edx, [edx + 16]
4783 sub ecx, 4
4784 jg convertloop
4785
4786 pop esi
4787 ret
4788 }
4789 }
4790 #endif // HAS_ARGBMULTIPLYROW_SSE2
4791
4792 #ifdef HAS_ARGBADDROW_SSE2
4793 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
4794 // TODO(fbarchard): Port this to posix, neon and other math functions.
4795 __declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0,
4796 const uint8* src_argb1,
4797 uint8* dst_argb,
4798 int width) {
4799 __asm {
4800 push esi
4801 mov eax, [esp + 4 + 4] // src_argb0
4802 mov esi, [esp + 4 + 8] // src_argb1
4803 mov edx, [esp + 4 + 12] // dst_argb
4804 mov ecx, [esp + 4 + 16] // width
4805
4806 sub ecx, 4
4807 jl convertloop49
4808
4809 convertloop4:
4810 movdqu xmm0, [eax] // read 4 pixels from src_argb0
4811 lea eax, [eax + 16]
4812 movdqu xmm1, [esi] // read 4 pixels from src_argb1
4813 lea esi, [esi + 16]
4814 paddusb xmm0, xmm1 // src_argb0 + src_argb1
4815 movdqu [edx], xmm0
4816 lea edx, [edx + 16]
4817 sub ecx, 4
4818 jge convertloop4
4819
4820 convertloop49:
4821 add ecx, 4 - 1
4822 jl convertloop19
4823
4824 convertloop1:
4825 movd xmm0, [eax] // read 1 pixel from src_argb0
4826 lea eax, [eax + 4]
4827 movd xmm1, [esi] // read 1 pixel from src_argb1
4828 lea esi, [esi + 4]
4829 paddusb xmm0, xmm1 // src_argb0 + src_argb1
4830 movd [edx], xmm0
4831 lea edx, [edx + 4]
4832 sub ecx, 1
4833 jge convertloop1
4834
4835 convertloop19:
4836 pop esi
4837 ret
4838 }
4839 }
4840 #endif // HAS_ARGBADDROW_SSE2
4841
4842 #ifdef HAS_ARGBSUBTRACTROW_SSE2
4843 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
4844 __declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0,
4845 const uint8* src_argb1,
4846 uint8* dst_argb,
4847 int width) {
4848 __asm {
4849 push esi
4850 mov eax, [esp + 4 + 4] // src_argb0
4851 mov esi, [esp + 4 + 8] // src_argb1
4852 mov edx, [esp + 4 + 12] // dst_argb
4853 mov ecx, [esp + 4 + 16] // width
4854
4855 convertloop:
4856 movdqu xmm0, [eax] // read 4 pixels from src_argb0
4857 lea eax, [eax + 16]
4858 movdqu xmm1, [esi] // read 4 pixels from src_argb1
4859 lea esi, [esi + 16]
4860 psubusb xmm0, xmm1 // src_argb0 - src_argb1
4861 movdqu [edx], xmm0
4862 lea edx, [edx + 16]
4863 sub ecx, 4
4864 jg convertloop
4865
4866 pop esi
4867 ret
4868 }
4869 }
4870 #endif // HAS_ARGBSUBTRACTROW_SSE2
4871
4872 #ifdef HAS_ARGBMULTIPLYROW_AVX2
4873 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
4874 __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
4875 const uint8* src_argb1,
4876 uint8* dst_argb,
4877 int width) {
4878 __asm {
4879 push esi
4880 mov eax, [esp + 4 + 4] // src_argb0
4881 mov esi, [esp + 4 + 8] // src_argb1
4882 mov edx, [esp + 4 + 12] // dst_argb
4883 mov ecx, [esp + 4 + 16] // width
4884 vpxor ymm5, ymm5, ymm5 // constant 0
4885
4886 convertloop:
4887 vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
4888 lea eax, [eax + 32]
4889 vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
4890 lea esi, [esi + 32]
4891 vpunpcklbw ymm0, ymm1, ymm1 // low 4
4892 vpunpckhbw ymm1, ymm1, ymm1 // high 4
4893 vpunpcklbw ymm2, ymm3, ymm5 // low 4
4894 vpunpckhbw ymm3, ymm3, ymm5 // high 4
4895 vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
4896 vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
4897 vpackuswb ymm0, ymm0, ymm1
4898 vmovdqu [edx], ymm0
4899 lea edx, [edx + 32]
4900 sub ecx, 8
4901 jg convertloop
4902
4903 pop esi
4904 vzeroupper
4905 ret
4906 }
4907 }
4908 #endif // HAS_ARGBMULTIPLYROW_AVX2
4909
4910 #ifdef HAS_ARGBADDROW_AVX2
4911 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
4912 __declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0,
4913 const uint8* src_argb1,
4914 uint8* dst_argb,
4915 int width) {
4916 __asm {
4917 push esi
4918 mov eax, [esp + 4 + 4] // src_argb0
4919 mov esi, [esp + 4 + 8] // src_argb1
4920 mov edx, [esp + 4 + 12] // dst_argb
4921 mov ecx, [esp + 4 + 16] // width
4922
4923 convertloop:
4924 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
4925 lea eax, [eax + 32]
4926 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
4927 lea esi, [esi + 32]
4928 vmovdqu [edx], ymm0
4929 lea edx, [edx + 32]
4930 sub ecx, 8
4931 jg convertloop
4932
4933 pop esi
4934 vzeroupper
4935 ret
4936 }
4937 }
4938 #endif // HAS_ARGBADDROW_AVX2
4939
4940 #ifdef HAS_ARGBSUBTRACTROW_AVX2
4941 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
4942 __declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0,
4943 const uint8* src_argb1,
4944 uint8* dst_argb,
4945 int width) {
4946 __asm {
4947 push esi
4948 mov eax, [esp + 4 + 4] // src_argb0
4949 mov esi, [esp + 4 + 8] // src_argb1
4950 mov edx, [esp + 4 + 12] // dst_argb
4951 mov ecx, [esp + 4 + 16] // width
4952
4953 convertloop:
4954 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
4955 lea eax, [eax + 32]
4956 vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
4957 lea esi, [esi + 32]
4958 vmovdqu [edx], ymm0
4959 lea edx, [edx + 32]
4960 sub ecx, 8
4961 jg convertloop
4962
4963 pop esi
4964 vzeroupper
4965 ret
4966 }
4967 }
4968 #endif // HAS_ARGBSUBTRACTROW_AVX2
4969
4970 #ifdef HAS_SOBELXROW_SSE2
4971 // SobelX as a matrix is
4972 // -1 0 1
4973 // -2 0 2
4974 // -1 0 1
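// Scalar form for one output pixel (a sketch; y0, y1, y2 are the three source
// rows passed in below; packuswb supplies the clamp):
//   sobelx = min(255, abs((y0[0] - y0[2]) + 2 * (y1[0] - y1[2]) + (y2[0] - y2[2])))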
4975 __declspec(naked) void SobelXRow_SSE2(const uint8* src_y0,
4976 const uint8* src_y1,
4977 const uint8* src_y2,
4978 uint8* dst_sobelx,
4979 int width) {
4980 __asm {
4981 push esi
4982 push edi
4983 mov eax, [esp + 8 + 4] // src_y0
4984 mov esi, [esp + 8 + 8] // src_y1
4985 mov edi, [esp + 8 + 12] // src_y2
4986 mov edx, [esp + 8 + 16] // dst_sobelx
4987 mov ecx, [esp + 8 + 20] // width
4988 sub esi, eax
4989 sub edi, eax
4990 sub edx, eax
4991 pxor xmm5, xmm5 // constant 0
4992
4993 convertloop:
4994 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
4995 movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
4996 punpcklbw xmm0, xmm5
4997 punpcklbw xmm1, xmm5
4998 psubw xmm0, xmm1
4999 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5000 movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5001 punpcklbw xmm1, xmm5
5002 punpcklbw xmm2, xmm5
5003 psubw xmm1, xmm2
5004 movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
5005 movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
5006 punpcklbw xmm2, xmm5
5007 punpcklbw xmm3, xmm5
5008 psubw xmm2, xmm3
5009 paddw xmm0, xmm2
5010 paddw xmm0, xmm1
5011 paddw xmm0, xmm1
5012 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5013 psubw xmm1, xmm0
5014 pmaxsw xmm0, xmm1
5015 packuswb xmm0, xmm0
5016 movq qword ptr [eax + edx], xmm0
5017 lea eax, [eax + 8]
5018 sub ecx, 8
5019 jg convertloop
5020
5021 pop edi
5022 pop esi
5023 ret
5024 }
5025 }
5026 #endif // HAS_SOBELXROW_SSE2
5027
5028 #ifdef HAS_SOBELYROW_SSE2
5029 // SobelY as a matrix is
5030 // -1 -2 -1
5031 // 0 0 0
5032 // 1 2 1
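// Scalar form for one output pixel (a sketch; y0 is the row above y1;
// packuswb supplies the clamp):
//   sobely = min(255, abs((y0[0] - y1[0]) + 2 * (y0[1] - y1[1]) + (y0[2] - y1[2])))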
5033 __declspec(naked) void SobelYRow_SSE2(const uint8* src_y0,
5034 const uint8* src_y1,
5035 uint8* dst_sobely,
5036 int width) {
5037 __asm {
5038 push esi
5039 mov eax, [esp + 4 + 4] // src_y0
5040 mov esi, [esp + 4 + 8] // src_y1
5041 mov edx, [esp + 4 + 12] // dst_sobely
5042 mov ecx, [esp + 4 + 16] // width
5043 sub esi, eax
5044 sub edx, eax
5045 pxor xmm5, xmm5 // constant 0
5046
5047 convertloop:
5048 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
5049 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
5050 punpcklbw xmm0, xmm5
5051 punpcklbw xmm1, xmm5
5052 psubw xmm0, xmm1
5053 movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
5054 movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
5055 punpcklbw xmm1, xmm5
5056 punpcklbw xmm2, xmm5
5057 psubw xmm1, xmm2
5058 movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
5059 movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
5060 punpcklbw xmm2, xmm5
5061 punpcklbw xmm3, xmm5
5062 psubw xmm2, xmm3
5063 paddw xmm0, xmm2
5064 paddw xmm0, xmm1
5065 paddw xmm0, xmm1
5066 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
5067 psubw xmm1, xmm0
5068 pmaxsw xmm0, xmm1
5069 packuswb xmm0, xmm0
5070 movq qword ptr [eax + edx], xmm0
5071 lea eax, [eax + 8]
5072 sub ecx, 8
5073 jg convertloop
5074
5075 pop esi
5076 ret
5077 }
5078 }
5079 #endif // HAS_SOBELYROW_SSE2
5080
5081 #ifdef HAS_SOBELROW_SSE2
5082 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
5083 // A = 255
5084 // R = Sobel
5085 // G = Sobel
5086 // B = Sobel
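// Scalar form (a sketch): s = min(255, sobelx[i] + sobely[i]) via the
// saturating add below, and the output pixel is packed as
// 0xFF000000 | (s << 16) | (s << 8) | s.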
5087 __declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx,
5088 const uint8* src_sobely,
5089 uint8* dst_argb,
5090 int width) {
5091 __asm {
5092 push esi
5093 mov eax, [esp + 4 + 4] // src_sobelx
5094 mov esi, [esp + 4 + 8] // src_sobely
5095 mov edx, [esp + 4 + 12] // dst_argb
5096 mov ecx, [esp + 4 + 16] // width
5097 sub esi, eax
5098 pcmpeqb xmm5, xmm5 // alpha 255
5099 pslld xmm5, 24 // 0xff000000
5100
5101 convertloop:
5102 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5103 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5104 lea eax, [eax + 16]
5105 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5106 movdqa xmm2, xmm0 // GG
5107 punpcklbw xmm2, xmm0 // First 8
5108 punpckhbw xmm0, xmm0 // Next 8
5109 movdqa xmm1, xmm2 // GGGG
5110 punpcklwd xmm1, xmm2 // First 4
5111 punpckhwd xmm2, xmm2 // Next 4
5112 por xmm1, xmm5 // GGGA
5113 por xmm2, xmm5
5114 movdqa xmm3, xmm0 // GGGG
5115 punpcklwd xmm3, xmm0 // Next 4
5116 punpckhwd xmm0, xmm0 // Last 4
5117 por xmm3, xmm5 // GGGA
5118 por xmm0, xmm5
5119 movdqu [edx], xmm1
5120 movdqu [edx + 16], xmm2
5121 movdqu [edx + 32], xmm3
5122 movdqu [edx + 48], xmm0
5123 lea edx, [edx + 64]
5124 sub ecx, 16
5125 jg convertloop
5126
5127 pop esi
5128 ret
5129 }
5130 }
5131 #endif // HAS_SOBELROW_SSE2
5132
5133 #ifdef HAS_SOBELTOPLANEROW_SSE2
5134 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
5135 __declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
5136 const uint8* src_sobely,
5137 uint8* dst_y,
5138 int width) {
5139 __asm {
5140 push esi
5141 mov eax, [esp + 4 + 4] // src_sobelx
5142 mov esi, [esp + 4 + 8] // src_sobely
5143 mov edx, [esp + 4 + 12] // dst_y
5144 mov ecx, [esp + 4 + 16] // width
5145 sub esi, eax
5146
5147 convertloop:
5148 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5149 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5150 lea eax, [eax + 16]
5151 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5152 movdqu [edx], xmm0
5153 lea edx, [edx + 16]
5154 sub ecx, 16
5155 jg convertloop
5156
5157 pop esi
5158 ret
5159 }
5160 }
5161 #endif // HAS_SOBELTOPLANEROW_SSE2
5162
5163 #ifdef HAS_SOBELXYROW_SSE2
5164 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5165 // A = 255
5166 // R = Sobel X
5167 // G = Sobel
5168 // B = Sobel Y
5169 __declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx,
5170 const uint8* src_sobely,
5171 uint8* dst_argb,
5172 int width) {
5173 __asm {
5174 push esi
5175 mov eax, [esp + 4 + 4] // src_sobelx
5176 mov esi, [esp + 4 + 8] // src_sobely
5177 mov edx, [esp + 4 + 12] // dst_argb
5178 mov ecx, [esp + 4 + 16] // width
5179 sub esi, eax
5180 pcmpeqb xmm5, xmm5 // alpha 255
5181
5182 convertloop:
5183 movdqu xmm0, [eax] // read 16 pixels src_sobelx
5184 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
5185 lea eax, [eax + 16]
5186 movdqa xmm2, xmm0
5187 paddusb xmm2, xmm1 // sobel = sobelx + sobely
5188 movdqa xmm3, xmm0 // XA
5189 punpcklbw xmm3, xmm5
5190 punpckhbw xmm0, xmm5
5191 movdqa xmm4, xmm1 // YS
5192 punpcklbw xmm4, xmm2
5193 punpckhbw xmm1, xmm2
5194 movdqa xmm6, xmm4 // YSXA
5195 punpcklwd xmm6, xmm3 // First 4
5196 punpckhwd xmm4, xmm3 // Next 4
5197 movdqa xmm7, xmm1 // YSXA
5198 punpcklwd xmm7, xmm0 // Next 4
5199 punpckhwd xmm1, xmm0 // Last 4
5200 movdqu [edx], xmm6
5201 movdqu [edx + 16], xmm4
5202 movdqu [edx + 32], xmm7
5203 movdqu [edx + 48], xmm1
5204 lea edx, [edx + 64]
5205 sub ecx, 16
5206 jg convertloop
5207
5208 pop esi
5209 ret
5210 }
5211 }
5212 #endif // HAS_SOBELXYROW_SSE2
5213
5214 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5215 // Consider float CumulativeSum.
5216 // Consider calling CumulativeSum one row at a time as needed.
5217 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5218 // Convert cumulative sum for an area to an average for 1 pixel.
5219 // topleft is pointer to top left of CumulativeSum buffer for area.
5220 // botleft is pointer to bottom left of CumulativeSum buffer.
5221 // width is offset from left to right of area in CumulativeSum buffer measured
5222 // in number of ints.
5223 // area is the number of pixels in the area being averaged.
5224 // dst points to pixel to store result to.
5225 // count is number of averaged pixels to produce.
5226 // Does 4 pixels at a time.
5227 // This function requires alignment on accumulation buffer pointers.
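// Scalar equivalent for one output channel (a sketch; 'width' is the area
// width measured in ints, 4 per ARGB pixel):
//   int32 sum = topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width];
//   dst[i] = (uint8)(sum / area);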
5228 void CumulativeSumToAverageRow_SSE2(const int32* topleft,
5229 const int32* botleft,
5230 int width,
5231 int area,
5232 uint8* dst,
5233 int count) {
5234 __asm {
5235 mov eax, topleft // eax topleft
5236 mov esi, botleft // esi botleft
5237 mov edx, width
5238 movd xmm5, area
5239 mov edi, dst
5240 mov ecx, count
5241 cvtdq2ps xmm5, xmm5
5242 rcpss xmm4, xmm5 // 1.0f / area
5243 pshufd xmm4, xmm4, 0
5244 sub ecx, 4
5245 jl l4b
5246
5247 cmp area, 128 // 128 pixels will not overflow 15 bits.
5248 ja l4
5249
5250 pshufd xmm5, xmm5, 0 // area
5251 pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
5252 psrld xmm6, 16
5253 cvtdq2ps xmm6, xmm6
5254 addps xmm5, xmm6 // (65536.0 + area - 1)
5255 mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
5256 cvtps2dq xmm5, xmm5 // 0.16 fixed point
5257 packssdw xmm5, xmm5 // 16 bit shorts
5258
5259 // 4 pixel loop small blocks.
5260 s4:
5261 // top left
5262 movdqu xmm0, [eax]
5263 movdqu xmm1, [eax + 16]
5264 movdqu xmm2, [eax + 32]
5265 movdqu xmm3, [eax + 48]
5266
5267 // - top right
5268 psubd xmm0, [eax + edx * 4]
5269 psubd xmm1, [eax + edx * 4 + 16]
5270 psubd xmm2, [eax + edx * 4 + 32]
5271 psubd xmm3, [eax + edx * 4 + 48]
5272 lea eax, [eax + 64]
5273
5274 // - bottom left
5275 psubd xmm0, [esi]
5276 psubd xmm1, [esi + 16]
5277 psubd xmm2, [esi + 32]
5278 psubd xmm3, [esi + 48]
5279
5280 // + bottom right
5281 paddd xmm0, [esi + edx * 4]
5282 paddd xmm1, [esi + edx * 4 + 16]
5283 paddd xmm2, [esi + edx * 4 + 32]
5284 paddd xmm3, [esi + edx * 4 + 48]
5285 lea esi, [esi + 64]
5286
5287 packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
5288 packssdw xmm2, xmm3
5289
5290 pmulhuw xmm0, xmm5
5291 pmulhuw xmm2, xmm5
5292
5293 packuswb xmm0, xmm2
5294 movdqu [edi], xmm0
5295 lea edi, [edi + 16]
5296 sub ecx, 4
5297 jge s4
5298
5299 jmp l4b
5300
5301 // 4 pixel loop
5302 l4:
5303 // top left
5304 movdqu xmm0, [eax]
5305 movdqu xmm1, [eax + 16]
5306 movdqu xmm2, [eax + 32]
5307 movdqu xmm3, [eax + 48]
5308
5309 // - top right
5310 psubd xmm0, [eax + edx * 4]
5311 psubd xmm1, [eax + edx * 4 + 16]
5312 psubd xmm2, [eax + edx * 4 + 32]
5313 psubd xmm3, [eax + edx * 4 + 48]
5314 lea eax, [eax + 64]
5315
5316 // - bottom left
5317 psubd xmm0, [esi]
5318 psubd xmm1, [esi + 16]
5319 psubd xmm2, [esi + 32]
5320 psubd xmm3, [esi + 48]
5321
5322 // + bottom right
5323 paddd xmm0, [esi + edx * 4]
5324 paddd xmm1, [esi + edx * 4 + 16]
5325 paddd xmm2, [esi + edx * 4 + 32]
5326 paddd xmm3, [esi + edx * 4 + 48]
5327 lea esi, [esi + 64]
5328
5329 cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
5330 cvtdq2ps xmm1, xmm1
5331 mulps xmm0, xmm4
5332 mulps xmm1, xmm4
5333 cvtdq2ps xmm2, xmm2
5334 cvtdq2ps xmm3, xmm3
5335 mulps xmm2, xmm4
5336 mulps xmm3, xmm4
5337 cvtps2dq xmm0, xmm0
5338 cvtps2dq xmm1, xmm1
5339 cvtps2dq xmm2, xmm2
5340 cvtps2dq xmm3, xmm3
5341 packssdw xmm0, xmm1
5342 packssdw xmm2, xmm3
5343 packuswb xmm0, xmm2
5344 movdqu [edi], xmm0
5345 lea edi, [edi + 16]
5346 sub ecx, 4
5347 jge l4
5348
5349 l4b:
5350 add ecx, 4 - 1
5351 jl l1b
5352
5353 // 1 pixel loop
5354 l1:
5355 movdqu xmm0, [eax]
5356 psubd xmm0, [eax + edx * 4]
5357 lea eax, [eax + 16]
5358 psubd xmm0, [esi]
5359 paddd xmm0, [esi + edx * 4]
5360 lea esi, [esi + 16]
5361 cvtdq2ps xmm0, xmm0
5362 mulps xmm0, xmm4
5363 cvtps2dq xmm0, xmm0
5364 packssdw xmm0, xmm0
5365 packuswb xmm0, xmm0
5366 movd dword ptr [edi], xmm0
5367 lea edi, [edi + 4]
5368 sub ecx, 1
5369 jge l1
5370 l1b:
5371 }
5372 }
5373 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5374
5375 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
5376 // Creates a table of cumulative sums where each value is a sum of all values
5377 // above and to the left of the value.
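// Scalar equivalent (a sketch): keep a running per-channel sum along the row
// and add the row above:
//   int32 sum[4] = {0, 0, 0, 0};
//   for (x = 0; x < width; ++x)
//     for (c = 0; c < 4; ++c) {
//       sum[c] += row[x * 4 + c];
//       cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
//     }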
5378 void ComputeCumulativeSumRow_SSE2(const uint8* row,
5379 int32* cumsum,
5380 const int32* previous_cumsum,
5381 int width) {
5382 __asm {
5383 mov eax, row
5384 mov edx, cumsum
5385 mov esi, previous_cumsum
5386 mov ecx, width
5387 pxor xmm0, xmm0
5388 pxor xmm1, xmm1
5389
5390 sub ecx, 4
5391 jl l4b
5392 test edx, 15 // skip the 4 pixel loop if cumsum is not 16 byte aligned
5393 jne l4b
5394
5395 // 4 pixel loop
5396 l4:
5397 movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
5398 lea eax, [eax + 16]
5399 movdqa xmm4, xmm2
5400
5401 punpcklbw xmm2, xmm1
5402 movdqa xmm3, xmm2
5403 punpcklwd xmm2, xmm1
5404 punpckhwd xmm3, xmm1
5405
5406 punpckhbw xmm4, xmm1
5407 movdqa xmm5, xmm4
5408 punpcklwd xmm4, xmm1
5409 punpckhwd xmm5, xmm1
5410
5411 paddd xmm0, xmm2
5412 movdqu xmm2, [esi] // previous row above.
5413 paddd xmm2, xmm0
5414
5415 paddd xmm0, xmm3
5416 movdqu xmm3, [esi + 16]
5417 paddd xmm3, xmm0
5418
5419 paddd xmm0, xmm4
5420 movdqu xmm4, [esi + 32]
5421 paddd xmm4, xmm0
5422
5423 paddd xmm0, xmm5
5424 movdqu xmm5, [esi + 48]
5425 lea esi, [esi + 64]
5426 paddd xmm5, xmm0
5427
5428 movdqu [edx], xmm2
5429 movdqu [edx + 16], xmm3
5430 movdqu [edx + 32], xmm4
5431 movdqu [edx + 48], xmm5
5432
5433 lea edx, [edx + 64]
5434 sub ecx, 4
5435 jge l4
5436
5437 l4b:
5438 add ecx, 4 - 1
5439 jl l1b
5440
5441 // 1 pixel loop
5442 l1:
5443 movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
5444 lea eax, [eax + 4]
5445 punpcklbw xmm2, xmm1
5446 punpcklwd xmm2, xmm1
5447 paddd xmm0, xmm2
5448 movdqu xmm2, [esi]
5449 lea esi, [esi + 16]
5450 paddd xmm2, xmm0
5451 movdqu [edx], xmm2
5452 lea edx, [edx + 16]
5453 sub ecx, 1
5454 jge l1
5455
5456 l1b:
5457 }
5458 }
5459 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
5460
5461 #ifdef HAS_ARGBAFFINEROW_SSE2
5462 // Copy ARGB pixels from source image with slope to a row of destination.
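// Scalar equivalent (a sketch): walk (u, v) by (du, dv) per output pixel and
// copy the nearest source pixel; uv_dudv holds {u, v, du, dv} as floats:
//   float u = uv_dudv[0], v = uv_dudv[1];
//   for (i = 0; i < width; ++i) {
//     *(uint32*)(dst_argb + i * 4) =
//         *(const uint32*)(src_argb + (int)v * src_argb_stride + (int)u * 4);
//     u += uv_dudv[2];
//     v += uv_dudv[3];
//   }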
5463 __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
5464 int src_argb_stride,
5465 uint8* dst_argb,
5466 const float* uv_dudv,
5467 int width) {
5468 __asm {
5469 push esi
5470 push edi
5471 mov eax, [esp + 12] // src_argb
5472 mov esi, [esp + 16] // stride
5473 mov edx, [esp + 20] // dst_argb
5474 mov ecx, [esp + 24] // pointer to uv_dudv
5475 movq xmm2, qword ptr [ecx] // uv
5476 movq xmm7, qword ptr [ecx + 8] // dudv
5477 mov ecx, [esp + 28] // width
5478 shl esi, 16 // 4, stride
5479 add esi, 4
5480 movd xmm5, esi
5481 sub ecx, 4
5482 jl l4b
5483
5484 // setup for 4 pixel loop
5485 pshufd xmm7, xmm7, 0x44 // dup dudv
5486 pshufd xmm5, xmm5, 0 // dup 4, stride
5487 movdqa xmm0, xmm2 // x0, y0, x1, y1
5488 addps xmm0, xmm7
5489 movlhps xmm2, xmm0
5490 movdqa xmm4, xmm7
5491 addps xmm4, xmm4 // dudv *= 2
5492 movdqa xmm3, xmm2 // x2, y2, x3, y3
5493 addps xmm3, xmm4
5494 addps xmm4, xmm4 // dudv *= 4
5495
5496 // 4 pixel loop
5497 l4:
5498 cvttps2dq xmm0, xmm2 // x, y float to int first 2
5499 cvttps2dq xmm1, xmm3 // x, y float to int next 2
5500 packssdw xmm0, xmm1 // x, y as 8 shorts
5501 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
5502 movd esi, xmm0
5503 pshufd xmm0, xmm0, 0x39 // shift right
5504 movd edi, xmm0
5505 pshufd xmm0, xmm0, 0x39 // shift right
5506 movd xmm1, [eax + esi] // read pixel 0
5507 movd xmm6, [eax + edi] // read pixel 1
5508 punpckldq xmm1, xmm6 // combine pixel 0 and 1
5509 addps xmm2, xmm4 // x, y += dx, dy first 2
5510 movq qword ptr [edx], xmm1
5511 movd esi, xmm0
5512 pshufd xmm0, xmm0, 0x39 // shift right
5513 movd edi, xmm0
5514 movd xmm6, [eax + esi] // read pixel 2
5515 movd xmm0, [eax + edi] // read pixel 3
5516 punpckldq xmm6, xmm0 // combine pixel 2 and 3
5517 addps xmm3, xmm4 // x, y += dx, dy next 2
5518 movq qword ptr [edx + 8], xmm6
5519 lea edx, [edx + 16]
5520 sub ecx, 4
5521 jge l4
5522
5523 l4b:
5524 add ecx, 4 - 1
5525 jl l1b
5526
5527 // 1 pixel loop
5528 l1:
5529 cvttps2dq xmm0, xmm2 // x, y float to int
5530 packssdw xmm0, xmm0 // x, y as shorts
5531 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
5532 addps xmm2, xmm7 // x, y += dx, dy
5533 movd esi, xmm0
5534 movd xmm0, [eax + esi] // copy a pixel
5535 movd [edx], xmm0
5536 lea edx, [edx + 4]
5537 sub ecx, 1
5538 jge l1
5539 l1b:
5540 pop edi
5541 pop esi
5542 ret
5543 }
5544 }
5545 #endif // HAS_ARGBAFFINEROW_SSE2
5546
5547 #ifdef HAS_INTERPOLATEROW_AVX2
5548 // Bilinear filter 32x2 -> 32x1
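// Per byte this computes, in effect (a sketch of the blend; f is
// source_y_fraction):
//   dst[i] = (src[i] * (256 - f) + src[i + stride] * f + 128) >> 8
// with the f == 0 and f == 128 cases taking the copy / pavgb shortcuts below.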
5549 __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
5550 const uint8* src_ptr,
5551 ptrdiff_t src_stride,
5552 int dst_width,
5553 int source_y_fraction) {
5554 __asm {
5555 push esi
5556 push edi
5557 mov edi, [esp + 8 + 4] // dst_ptr
5558 mov esi, [esp + 8 + 8] // src_ptr
5559 mov edx, [esp + 8 + 12] // src_stride
5560 mov ecx, [esp + 8 + 16] // dst_width
5561 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5562 // Dispatch to specialized filters if applicable.
5563 cmp eax, 0
5564 je xloop100 // 0 / 256. Blend 100 / 0.
5565 sub edi, esi
5566 cmp eax, 128
5567 je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
5568
5569 vmovd xmm0, eax // high fraction 0..255
5570 neg eax
5571 add eax, 256
5572 vmovd xmm5, eax // low fraction 256..1
5573 vpunpcklbw xmm5, xmm5, xmm0
5574 vpunpcklwd xmm5, xmm5, xmm5
5575 vbroadcastss ymm5, xmm5
5576
5577 mov eax, 0x80808080 // 128 for bias and rounding.
5578 vmovd xmm4, eax
5579 vbroadcastss ymm4, xmm4
5580
5581 xloop:
5582 vmovdqu ymm0, [esi]
5583 vmovdqu ymm2, [esi + edx]
5584 vpunpckhbw ymm1, ymm0, ymm2 // mutates
5585 vpunpcklbw ymm0, ymm0, ymm2
5586 vpsubb ymm1, ymm1, ymm4 // bias to signed image
5587 vpsubb ymm0, ymm0, ymm4
5588 vpmaddubsw ymm1, ymm5, ymm1
5589 vpmaddubsw ymm0, ymm5, ymm0
5590 vpaddw ymm1, ymm1, ymm4 // unbias and round
5591 vpaddw ymm0, ymm0, ymm4
5592 vpsrlw ymm1, ymm1, 8
5593 vpsrlw ymm0, ymm0, 8
5594 vpackuswb ymm0, ymm0, ymm1 // unmutates
5595 vmovdqu [esi + edi], ymm0
5596 lea esi, [esi + 32]
5597 sub ecx, 32
5598 jg xloop
5599 jmp xloop99
5600
5601 // Blend 50 / 50.
5602 xloop50:
5603 vmovdqu ymm0, [esi]
5604 vpavgb ymm0, ymm0, [esi + edx]
5605 vmovdqu [esi + edi], ymm0
5606 lea esi, [esi + 32]
5607 sub ecx, 32
5608 jg xloop50
5609 jmp xloop99
5610
5611 // Blend 100 / 0 - Copy row unchanged.
5612 xloop100:
5613 rep movsb
5614
5615 xloop99:
5616 pop edi
5617 pop esi
5618 vzeroupper
5619 ret
5620 }
5621 }
5622 #endif // HAS_INTERPOLATEROW_AVX2
5623
5624 // Bilinear filter 16x2 -> 16x1
5625 // TODO(fbarchard): Consider allowing 256 using memcpy.
5626 __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
5627 const uint8* src_ptr,
5628 ptrdiff_t src_stride,
5629 int dst_width,
5630 int source_y_fraction) {
5631 __asm {
5632 push esi
5633 push edi
5634
5635 mov edi, [esp + 8 + 4] // dst_ptr
5636 mov esi, [esp + 8 + 8] // src_ptr
5637 mov edx, [esp + 8 + 12] // src_stride
5638 mov ecx, [esp + 8 + 16] // dst_width
5639 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
5640 sub edi, esi
5641 // Dispatch to specialized filters if applicable.
5642 cmp eax, 0
5643 je xloop100 // 0 /256. Blend 100 / 0.
5644 cmp eax, 128
5645 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
5646
5647 movd xmm0, eax // high fraction 0..255
5648 neg eax
5649 add eax, 256
5650 movd xmm5, eax // low fraction 255..1
5651 punpcklbw xmm5, xmm0
5652 punpcklwd xmm5, xmm5
5653 pshufd xmm5, xmm5, 0
5654 mov eax, 0x80808080 // 128 for biasing image to signed.
5655 movd xmm4, eax
5656 pshufd xmm4, xmm4, 0x00
5657
5658 xloop:
5659 movdqu xmm0, [esi]
5660 movdqu xmm2, [esi + edx]
5661 movdqu xmm1, xmm0
5662 punpcklbw xmm0, xmm2
5663 punpckhbw xmm1, xmm2
5664 psubb xmm0, xmm4 // bias image by -128
5665 psubb xmm1, xmm4
5666 movdqa xmm2, xmm5
5667 movdqa xmm3, xmm5
5668 pmaddubsw xmm2, xmm0
5669 pmaddubsw xmm3, xmm1
5670 paddw xmm2, xmm4
5671 paddw xmm3, xmm4
5672 psrlw xmm2, 8
5673 psrlw xmm3, 8
5674 packuswb xmm2, xmm3
5675 movdqu [esi + edi], xmm2
5676 lea esi, [esi + 16]
5677 sub ecx, 16
5678 jg xloop
5679 jmp xloop99
5680
5681 // Blend 50 / 50.
5682 xloop50:
5683 movdqu xmm0, [esi]
5684 movdqu xmm1, [esi + edx]
5685 pavgb xmm0, xmm1
5686 movdqu [esi + edi], xmm0
5687 lea esi, [esi + 16]
5688 sub ecx, 16
5689 jg xloop50
5690 jmp xloop99
5691
5692 // Blend 100 / 0 - Copy row unchanged.
5693 xloop100:
5694 movdqu xmm0, [esi]
5695 movdqu [esi + edi], xmm0
5696 lea esi, [esi + 16]
5697 sub ecx, 16
5698 jg xloop100
5699
5700 xloop99:
5701 pop edi
5702 pop esi
5703 ret
5704 }
5705 }
5706
5707 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5708 __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb,
5709 uint8* dst_argb,
5710 const uint8* shuffler,
5711 int width) {
5712 __asm {
5713 mov eax, [esp + 4] // src_argb
5714 mov edx, [esp + 8] // dst_argb
5715 mov ecx, [esp + 12] // shuffler
5716 movdqu xmm5, [ecx]
5717 mov ecx, [esp + 16] // width
5718
5719 wloop:
5720 movdqu xmm0, [eax]
5721 movdqu xmm1, [eax + 16]
5722 lea eax, [eax + 32]
5723 pshufb xmm0, xmm5
5724 pshufb xmm1, xmm5
5725 movdqu [edx], xmm0
5726 movdqu [edx + 16], xmm1
5727 lea edx, [edx + 32]
5728 sub ecx, 8
5729 jg wloop
5730 ret
5731 }
5732 }
5733
5734 #ifdef HAS_ARGBSHUFFLEROW_AVX2
5735 __declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb,
5736 uint8* dst_argb,
5737 const uint8* shuffler,
5738 int width) {
5739 __asm {
5740 mov eax, [esp + 4] // src_argb
5741 mov edx, [esp + 8] // dst_argb
5742 mov ecx, [esp + 12] // shuffler
5743 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
5744 mov ecx, [esp + 16] // width
5745
5746 wloop:
5747 vmovdqu ymm0, [eax]
5748 vmovdqu ymm1, [eax + 32]
5749 lea eax, [eax + 64]
5750 vpshufb ymm0, ymm0, ymm5
5751 vpshufb ymm1, ymm1, ymm5
5752 vmovdqu [edx], ymm0
5753 vmovdqu [edx + 32], ymm1
5754 lea edx, [edx + 64]
5755 sub ecx, 16
5756 jg wloop
5757
5758 vzeroupper
5759 ret
5760 }
5761 }
5762 #endif // HAS_ARGBSHUFFLEROW_AVX2
5763
5764 __declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb,
5765 uint8* dst_argb,
5766 const uint8* shuffler,
5767 int width) {
5768 __asm {
5769 push ebx
5770 push esi
5771 mov eax, [esp + 8 + 4] // src_argb
5772 mov edx, [esp + 8 + 8] // dst_argb
5773 mov esi, [esp + 8 + 12] // shuffler
5774 mov ecx, [esp + 8 + 16] // width
5775 pxor xmm5, xmm5
5776
5777 mov ebx, [esi] // shuffler
5778 cmp ebx, 0x03000102
5779 je shuf_3012
5780 cmp ebx, 0x00010203
5781 je shuf_0123
5782 cmp ebx, 0x00030201
5783 je shuf_0321
5784 cmp ebx, 0x02010003
5785 je shuf_2103
5786
5787 // TODO(fbarchard): Use one source pointer and 3 offsets.
5788 shuf_any1:
5789 movzx ebx, byte ptr [esi]
5790 movzx ebx, byte ptr [eax + ebx]
5791 mov [edx], bl
5792 movzx ebx, byte ptr [esi + 1]
5793 movzx ebx, byte ptr [eax + ebx]
5794 mov [edx + 1], bl
5795 movzx ebx, byte ptr [esi + 2]
5796 movzx ebx, byte ptr [eax + ebx]
5797 mov [edx + 2], bl
5798 movzx ebx, byte ptr [esi + 3]
5799 movzx ebx, byte ptr [eax + ebx]
5800 mov [edx + 3], bl
5801 lea eax, [eax + 4]
5802 lea edx, [edx + 4]
5803 sub ecx, 1
5804 jg shuf_any1
5805 jmp shuf99
5806
5807 shuf_0123:
5808 movdqu xmm0, [eax]
5809 lea eax, [eax + 16]
5810 movdqa xmm1, xmm0
5811 punpcklbw xmm0, xmm5
5812 punpckhbw xmm1, xmm5
5813 pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
5814 pshuflw xmm0, xmm0, 01Bh
5815 pshufhw xmm1, xmm1, 01Bh
5816 pshuflw xmm1, xmm1, 01Bh
5817 packuswb xmm0, xmm1
5818 movdqu [edx], xmm0
5819 lea edx, [edx + 16]
5820 sub ecx, 4
5821 jg shuf_0123
5822 jmp shuf99
5823
5824 shuf_0321:
5825 movdqu xmm0, [eax]
5826 lea eax, [eax + 16]
5827 movdqa xmm1, xmm0
5828 punpcklbw xmm0, xmm5
5829 punpckhbw xmm1, xmm5
5830 pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
5831 pshuflw xmm0, xmm0, 039h
5832 pshufhw xmm1, xmm1, 039h
5833 pshuflw xmm1, xmm1, 039h
5834 packuswb xmm0, xmm1
5835 movdqu [edx], xmm0
5836 lea edx, [edx + 16]
5837 sub ecx, 4
5838 jg shuf_0321
5839 jmp shuf99
5840
5841 shuf_2103:
5842 movdqu xmm0, [eax]
5843 lea eax, [eax + 16]
5844 movdqa xmm1, xmm0
5845 punpcklbw xmm0, xmm5
5846 punpckhbw xmm1, xmm5
5847 pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
5848 pshuflw xmm0, xmm0, 093h
5849 pshufhw xmm1, xmm1, 093h
5850 pshuflw xmm1, xmm1, 093h
5851 packuswb xmm0, xmm1
5852 movdqu [edx], xmm0
5853 lea edx, [edx + 16]
5854 sub ecx, 4
5855 jg shuf_2103
5856 jmp shuf99
5857
5858 shuf_3012:
5859 movdqu xmm0, [eax]
5860 lea eax, [eax + 16]
5861 movdqa xmm1, xmm0
5862 punpcklbw xmm0, xmm5
5863 punpckhbw xmm1, xmm5
5864 pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
5865 pshuflw xmm0, xmm0, 0C6h
5866 pshufhw xmm1, xmm1, 0C6h
5867 pshuflw xmm1, xmm1, 0C6h
5868 packuswb xmm0, xmm1
5869 movdqu [edx], xmm0
5870 lea edx, [edx + 16]
5871 sub ecx, 4
5872 jg shuf_3012
5873
5874 shuf99:
5875 pop esi
5876 pop ebx
5877 ret
5878 }
5879 }
5880
5881 // YUY2 - Macro-pixel = 2 image pixels
5882 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
5883
5884 // UYVY - Macro-pixel = 2 image pixels
5885 // U0Y0V0Y1
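// I422ToYUY2Row below interleaves one UV pair per two Y samples
// (a sketch of the packing): dst = Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
// I422ToUYVYRow emits the swapped order: U0 Y0 V0 Y1 ...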
5886
5887 __declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y,
5888 const uint8* src_u,
5889 const uint8* src_v,
5890 uint8* dst_frame,
5891 int width) {
5892 __asm {
5893 push esi
5894 push edi
5895 mov eax, [esp + 8 + 4] // src_y
5896 mov esi, [esp + 8 + 8] // src_u
5897 mov edx, [esp + 8 + 12] // src_v
5898 mov edi, [esp + 8 + 16] // dst_frame
5899 mov ecx, [esp + 8 + 20] // width
5900 sub edx, esi
5901
5902 convertloop:
5903 movq xmm2, qword ptr [esi] // U
5904 movq xmm3, qword ptr [esi + edx] // V
5905 lea esi, [esi + 8]
5906 punpcklbw xmm2, xmm3 // UV
5907 movdqu xmm0, [eax] // Y
5908 lea eax, [eax + 16]
5909 movdqa xmm1, xmm0
5910 punpcklbw xmm0, xmm2 // YUYV
5911 punpckhbw xmm1, xmm2
5912 movdqu [edi], xmm0
5913 movdqu [edi + 16], xmm1
5914 lea edi, [edi + 32]
5915 sub ecx, 16
5916 jg convertloop
5917
5918 pop edi
5919 pop esi
5920 ret
5921 }
5922 }
5923
5924 __declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y,
5925 const uint8* src_u,
5926 const uint8* src_v,
5927 uint8* dst_frame,
5928 int width) {
5929 __asm {
5930 push esi
5931 push edi
5932 mov eax, [esp + 8 + 4] // src_y
5933 mov esi, [esp + 8 + 8] // src_u
5934 mov edx, [esp + 8 + 12] // src_v
5935 mov edi, [esp + 8 + 16] // dst_frame
5936 mov ecx, [esp + 8 + 20] // width
5937 sub edx, esi
5938
5939 convertloop:
5940 movq xmm2, qword ptr [esi] // U
5941 movq xmm3, qword ptr [esi + edx] // V
5942 lea esi, [esi + 8]
5943 punpcklbw xmm2, xmm3 // UV
5944 movdqu xmm0, [eax] // Y
5945 movdqa xmm1, xmm2
5946 lea eax, [eax + 16]
5947 punpcklbw xmm1, xmm0 // UYVY
5948 punpckhbw xmm2, xmm0
5949 movdqu [edi], xmm1
5950 movdqu [edi + 16], xmm2
5951 lea edi, [edi + 32]
5952 sub ecx, 16
5953 jg convertloop
5954
5955 pop edi
5956 pop esi
5957 ret
5958 }
5959 }
5960
5961 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
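// Per channel this evaluates a cubic polynomial on the byte value X
// (a sketch; poly holds 4 coefficient vectors of 4 floats: C0, C1, C2, C3):
//   dst = clamp_to_byte(C0[c] + C1[c] * X + C2[c] * X * X + C3[c] * X * X * X)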
5962 __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
5963 uint8* dst_argb,
5964 const float* poly,
5965 int width) {
5966 __asm {
5967 push esi
5968 mov eax, [esp + 4 + 4] /* src_argb */
5969 mov edx, [esp + 4 + 8] /* dst_argb */
5970 mov esi, [esp + 4 + 12] /* poly */
5971 mov ecx, [esp + 4 + 16] /* width */
5972 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
5973
5974 // 2 pixel loop.
5975 convertloop:
5976 // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
5977 // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
5978 movq xmm0, qword ptr [eax] // BGRABGRA
5979 lea eax, [eax + 8]
5980 punpcklbw xmm0, xmm3
5981 movdqa xmm4, xmm0
5982 punpcklwd xmm0, xmm3 // pixel 0
5983 punpckhwd xmm4, xmm3 // pixel 1
5984 cvtdq2ps xmm0, xmm0 // 4 floats
5985 cvtdq2ps xmm4, xmm4
5986 movdqa xmm1, xmm0 // X
5987 movdqa xmm5, xmm4
5988 mulps xmm0, [esi + 16] // C1 * X
5989 mulps xmm4, [esi + 16]
5990 addps xmm0, [esi] // result = C0 + C1 * X
5991 addps xmm4, [esi]
5992 movdqa xmm2, xmm1
5993 movdqa xmm6, xmm5
5994 mulps xmm2, xmm1 // X * X
5995 mulps xmm6, xmm5
5996 mulps xmm1, xmm2 // X * X * X
5997 mulps xmm5, xmm6
5998 mulps xmm2, [esi + 32] // C2 * X * X
5999 mulps xmm6, [esi + 32]
6000 mulps xmm1, [esi + 48] // C3 * X * X * X
6001 mulps xmm5, [esi + 48]
6002 addps xmm0, xmm2 // result += C2 * X * X
6003 addps xmm4, xmm6
6004 addps xmm0, xmm1 // result += C3 * X * X * X
6005 addps xmm4, xmm5
6006 cvttps2dq xmm0, xmm0
6007 cvttps2dq xmm4, xmm4
6008 packuswb xmm0, xmm4
6009 packuswb xmm0, xmm0
6010 movq qword ptr [edx], xmm0
6011 lea edx, [edx + 8]
6012 sub ecx, 2
6013 jg convertloop
6014 pop esi
6015 ret
6016 }
6017 }
6018 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
6019
6020 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
6021 __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb,
6022 uint8* dst_argb,
6023 const float* poly,
6024 int width) {
6025 __asm {
6026 mov eax, [esp + 4] /* src_argb */
6027 mov edx, [esp + 8] /* dst_argb */
6028 mov ecx, [esp + 12] /* poly */
6029 vbroadcastf128 ymm4, [ecx] // C0
6030 vbroadcastf128 ymm5, [ecx + 16] // C1
6031 vbroadcastf128 ymm6, [ecx + 32] // C2
6032 vbroadcastf128 ymm7, [ecx + 48] // C3
6033 mov ecx, [esp + 16] /* width */
6034
6035 // 2 pixel loop.
6036 convertloop:
6037 vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
6038 lea eax, [eax + 8]
6039 vcvtdq2ps ymm0, ymm0 // X 8 floats
6040 vmulps ymm2, ymm0, ymm0 // X * X
6041 vmulps ymm3, ymm0, ymm7 // C3 * X
6042 vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
6043 vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
6044 vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
6045 vcvttps2dq ymm0, ymm0
6046 vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
6047 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
6048 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
6049 vmovq qword ptr [edx], xmm0
6050 lea edx, [edx + 8]
6051 sub ecx, 2
6052 jg convertloop
6053 vzeroupper
6054 ret
6055 }
6056 }
6057 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
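
// A scalar sketch (illustrative only) of the per-channel cubic polynomial
// evaluated by the SSE2 and AVX2 rows above, assuming the coefficient layout
// implied by their loads: poly[0..3] = C0, poly[4..7] = C1, poly[8..11] = C2,
// poly[12..15] = C3, one float per B/G/R/A channel.
#if 0  // not compiled; reference sketch
static void ARGBPolynomialRow_Sketch(const uint8* src_argb,
                                     uint8* dst_argb,
                                     const float* poly,
                                     int width) {
  int i, c;
  for (i = 0; i < width; ++i) {
    for (c = 0; c < 4; ++c) {
      float x = (float)src_argb[i * 4 + c];
      float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                poly[c + 12] * x * x * x;
      if (v < 0.f) v = 0.f;  // packuswb in the SIMD versions clamps to 0..255
      if (v > 255.f) v = 255.f;
      dst_argb[i * 4 + c] = (uint8)v;
    }
  }
}
#endif  // 0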
6058
6059 #ifdef HAS_HALFFLOATROW_SSE2
6060 static float kExpBias = 1.9259299444e-34f;
6061 __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
6062 uint16* dst,
6063 float scale,
6064 int width) {
6065 __asm {
6066 mov eax, [esp + 4] /* src */
6067 mov edx, [esp + 8] /* dst */
6068 movd xmm4, dword ptr [esp + 12] /* scale */
6069 mov ecx, [esp + 16] /* width */
6070 mulss xmm4, kExpBias
6071 pshufd xmm4, xmm4, 0
6072 pxor xmm5, xmm5
6073 sub edx, eax
6074
6075 // 8 pixel loop.
6076 convertloop:
6077 movdqu xmm2, xmmword ptr [eax] // 8 shorts
6078 add eax, 16
6079 movdqa xmm3, xmm2
6080 punpcklwd xmm2, xmm5
6081 cvtdq2ps xmm2, xmm2 // convert 8 ints to floats
6082 punpckhwd xmm3, xmm5
6083 cvtdq2ps xmm3, xmm3
6084 mulps xmm2, xmm4
6085 mulps xmm3, xmm4
6086 psrld xmm2, 13
6087 psrld xmm3, 13
6088 packssdw xmm2, xmm3
6089 movdqu [eax + edx - 16], xmm2
6090 sub ecx, 8
6091 jg convertloop
6092 ret
6093 }
6094 }
6095 #endif // HAS_HALFFLOATROW_SSE2
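
// A scalar sketch (illustrative only) of the exponent-bias trick used by
// HalfFloatRow_SSE2 above and HalfFloatRow_AVX2 below. kExpBias is 2^-112:
// multiplying by it rebiases the float exponent from the float bias (127)
// down to the half-float bias (15), so the half-float bit pattern can be
// taken from the float bit pattern shifted right by 13 (23 - 10 mantissa
// bits). Assumes the scaled value stays in the positive normal half range.
#if 0  // not compiled; reference sketch
#include <string.h>  // for memcpy
static void HalfFloatRow_Sketch(const uint16* src,
                                uint16* dst,
                                float scale,
                                int width) {
  const float kBias = 1.9259299444e-34f;  // 2^-112
  int i;
  for (i = 0; i < width; ++i) {
    float f = (float)src[i] * scale * kBias;
    uint32 bits;
    memcpy(&bits, &f, 4);           // reinterpret the float bit pattern
    dst[i] = (uint16)(bits >> 13);  // exponent/mantissa align with half layout
  }
}
#endif  // 0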
6096
6097 #ifdef HAS_HALFFLOATROW_AVX2
6098 __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
6099 uint16* dst,
6100 float scale,
6101 int width) {
6102 __asm {
6103 mov eax, [esp + 4] /* src */
6104 mov edx, [esp + 8] /* dst */
6105 movd xmm4, dword ptr [esp + 12] /* scale */
6106 mov ecx, [esp + 16] /* width */
6107
6108 vmulss xmm4, xmm4, kExpBias
6109 vbroadcastss ymm4, xmm4
6110 vpxor ymm5, ymm5, ymm5
6111 sub edx, eax
6112
6113 // 16 pixel loop.
6114 convertloop:
6115 vmovdqu ymm2, [eax] // 16 shorts
6116 add eax, 32
6117 vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
6118 vpunpcklwd ymm2, ymm2, ymm5
6119 vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
6120 vcvtdq2ps ymm2, ymm2
6121 vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
6122 vmulps ymm2, ymm2, ymm4
6123 vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
6124 vpsrld ymm2, ymm2, 13
6125 vpackssdw ymm2, ymm2, ymm3
6126 vmovdqu [eax + edx - 32], ymm2
6127 sub ecx, 16
6128 jg convertloop
6129 vzeroupper
6130 ret
6131 }
6132 }
6133 #endif // HAS_HALFFLOATROW_AVX2
6134
6135 #ifdef HAS_HALFFLOATROW_F16C
6136 __declspec(naked) void HalfFloatRow_F16C(const uint16* src,
6137 uint16* dst,
6138 float scale,
6139 int width) {
6140 __asm {
6141 mov eax, [esp + 4] /* src */
6142 mov edx, [esp + 8] /* dst */
6143 vbroadcastss ymm4, [esp + 12] /* scale */
6144 mov ecx, [esp + 16] /* width */
6145 sub edx, eax
6146
6147 // 16 pixel loop.
6148 convertloop:
6149 vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
6150 vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
6151 add eax, 32
6152 vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
6153 vcvtdq2ps ymm3, ymm3
6154 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
6155 vmulps ymm3, ymm3, ymm4
6156 vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
6157 vcvtps2ph xmm3, ymm3, 3
6158 vmovdqu [eax + edx - 32], xmm2 // store 8 half floats
6159 vmovdqu [eax + edx + 16 - 32], xmm3 // store 8 more half floats
6160 sub ecx, 16
6161 jg convertloop
6162 vzeroupper
6163 ret
6164 }
6165 }
6166 #endif // HAS_HALFFLOATROW_F16C
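
// An intrinsics-level sketch (illustrative only, assuming AVX2 + F16C and a
// width that is a multiple of 8) of the conversion HalfFloatRow_F16C above
// performs: widen 8 uint16 values to floats, apply the scale, and convert to
// half floats with vcvtps2ph using round-toward-zero (immediate 3).
#if 0  // not compiled; reference sketch
#include <immintrin.h>
static void HalfFloatRow_F16C_Sketch(const uint16* src,
                                     uint16* dst,
                                     float scale,
                                     int width) {
  const __m256 vscale = _mm256_set1_ps(scale);
  int i;
  for (i = 0; i < width; i += 8) {
    __m128i s = _mm_loadu_si128((const __m128i*)(src + i));   // 8 uint16
    __m256 f = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(s));  // to 8 floats
    f = _mm256_mul_ps(f, vscale);                             // scale
    __m128i h = _mm256_cvtps_ph(f, 3);                        // to 8 halfs
    _mm_storeu_si128((__m128i*)(dst + i), h);
  }
}
#endif  // 0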
6167
6168 #ifdef HAS_ARGBCOLORTABLEROW_X86
6169 // Transform ARGB pixels with color table.
6170 __declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb,
6171 const uint8* table_argb,
6172 int width) {
6173 __asm {
6174 push esi
6175 mov eax, [esp + 4 + 4] /* dst_argb */
6176 mov esi, [esp + 4 + 8] /* table_argb */
6177 mov ecx, [esp + 4 + 12] /* width */
6178
6179 // 1 pixel loop.
6180 convertloop:
6181 movzx edx, byte ptr [eax]
6182 lea eax, [eax + 4]
6183 movzx edx, byte ptr [esi + edx * 4]
6184 mov byte ptr [eax - 4], dl
6185 movzx edx, byte ptr [eax - 4 + 1]
6186 movzx edx, byte ptr [esi + edx * 4 + 1]
6187 mov byte ptr [eax - 4 + 1], dl
6188 movzx edx, byte ptr [eax - 4 + 2]
6189 movzx edx, byte ptr [esi + edx * 4 + 2]
6190 mov byte ptr [eax - 4 + 2], dl
6191 movzx edx, byte ptr [eax - 4 + 3]
6192 movzx edx, byte ptr [esi + edx * 4 + 3]
6193 mov byte ptr [eax - 4 + 3], dl
6194 dec ecx
6195 jg convertloop
6196 pop esi
6197 ret
6198 }
6199 }
6200 #endif // HAS_ARGBCOLORTABLEROW_X86
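
// A scalar sketch (illustrative only) of the in-place per-channel lookup
// ARGBColorTableRow_X86 above performs. table_argb holds 256 interleaved
// 4-byte entries, so channel c of a pixel indexes table_argb[value * 4 + c].
#if 0  // not compiled; reference sketch
static void ARGBColorTableRow_Sketch(uint8* dst_argb,
                                     const uint8* table_argb,
                                     int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}
#endif  // 0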
6201
6202 #ifdef HAS_RGBCOLORTABLEROW_X86
6203 // Transform RGB pixels with color table.
6204 __declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb,
6205 const uint8* table_argb,
6206 int width) {
6207 __asm {
6208 push esi
6209 mov eax, [esp + 4 + 4] /* dst_argb */
6210 mov esi, [esp + 4 + 8] /* table_argb */
6211 mov ecx, [esp + 4 + 12] /* width */
6212
6213 // 1 pixel loop.
6214 convertloop:
6215 movzx edx, byte ptr [eax]
6216 lea eax, [eax + 4]
6217 movzx edx, byte ptr [esi + edx * 4]
6218 mov byte ptr [eax - 4], dl
6219 movzx edx, byte ptr [eax - 4 + 1]
6220 movzx edx, byte ptr [esi + edx * 4 + 1]
6221 mov byte ptr [eax - 4 + 1], dl
6222 movzx edx, byte ptr [eax - 4 + 2]
6223 movzx edx, byte ptr [esi + edx * 4 + 2]
6224 mov byte ptr [eax - 4 + 2], dl
6225 dec ecx
6226 jg convertloop
6227
6228 pop esi
6229 ret
6230 }
6231 }
6232 #endif // HAS_RGBCOLORTABLEROW_X86
6233
6234 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6235 // Transform ARGB pixels with luma table (alpha is copied unchanged).
6236 __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
6237 uint8* dst_argb,
6238 int width,
6239 const uint8* luma,
6240 uint32 lumacoeff) {
6241 __asm {
6242 push esi
6243 push edi
6244 mov eax, [esp + 8 + 4] /* src_argb */
6245 mov edi, [esp + 8 + 8] /* dst_argb */
6246 mov ecx, [esp + 8 + 12] /* width */
6247 movd xmm2, dword ptr [esp + 8 + 16] // luma table
6248 movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
6249 pshufd xmm2, xmm2, 0
6250 pshufd xmm3, xmm3, 0
6251 pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
6252 psllw xmm4, 8
6253 pxor xmm5, xmm5
6254
6255 // 4 pixel loop.
6256 convertloop:
6257 movdqu xmm0, xmmword ptr [eax] // generate luma ptr
6258 pmaddubsw xmm0, xmm3
6259 phaddw xmm0, xmm0
6260 pand xmm0, xmm4 // mask out low bits
6261 punpcklwd xmm0, xmm5
6262 paddd xmm0, xmm2 // add table base
6263 movd esi, xmm0
6264 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6265
6266 movzx edx, byte ptr [eax]
6267 movzx edx, byte ptr [esi + edx]
6268 mov byte ptr [edi], dl
6269 movzx edx, byte ptr [eax + 1]
6270 movzx edx, byte ptr [esi + edx]
6271 mov byte ptr [edi + 1], dl
6272 movzx edx, byte ptr [eax + 2]
6273 movzx edx, byte ptr [esi + edx]
6274 mov byte ptr [edi + 2], dl
6275 movzx edx, byte ptr [eax + 3] // copy alpha.
6276 mov byte ptr [edi + 3], dl
6277
6278 movd esi, xmm0
6279 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6280
6281 movzx edx, byte ptr [eax + 4]
6282 movzx edx, byte ptr [esi + edx]
6283 mov byte ptr [edi + 4], dl
6284 movzx edx, byte ptr [eax + 5]
6285 movzx edx, byte ptr [esi + edx]
6286 mov byte ptr [edi + 5], dl
6287 movzx edx, byte ptr [eax + 6]
6288 movzx edx, byte ptr [esi + edx]
6289 mov byte ptr [edi + 6], dl
6290 movzx edx, byte ptr [eax + 7] // copy alpha.
6291 mov byte ptr [edi + 7], dl
6292
6293 movd esi, xmm0
6294 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
6295
6296 movzx edx, byte ptr [eax + 8]
6297 movzx edx, byte ptr [esi + edx]
6298 mov byte ptr [edi + 8], dl
6299 movzx edx, byte ptr [eax + 9]
6300 movzx edx, byte ptr [esi + edx]
6301 mov byte ptr [edi + 9], dl
6302 movzx edx, byte ptr [eax + 10]
6303 movzx edx, byte ptr [esi + edx]
6304 mov byte ptr [edi + 10], dl
6305 movzx edx, byte ptr [eax + 11] // copy alpha.
6306 mov byte ptr [edi + 11], dl
6307
6308 movd esi, xmm0
6309
6310 movzx edx, byte ptr [eax + 12]
6311 movzx edx, byte ptr [esi + edx]
6312 mov byte ptr [edi + 12], dl
6313 movzx edx, byte ptr [eax + 13]
6314 movzx edx, byte ptr [esi + edx]
6315 mov byte ptr [edi + 13], dl
6316 movzx edx, byte ptr [eax + 14]
6317 movzx edx, byte ptr [esi + edx]
6318 mov byte ptr [edi + 14], dl
6319 movzx edx, byte ptr [eax + 15] // copy alpha.
6320 mov byte ptr [edi + 15], dl
6321
6322 lea eax, [eax + 16]
6323 lea edi, [edi + 16]
6324 sub ecx, 4
6325 jg convertloop
6326
6327 pop edi
6328 pop esi
6329 ret
6330 }
6331 }
6332 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
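
// A scalar sketch (illustrative only) of what ARGBLumaColorTableRow_SSSE3
// above computes, assuming the usual non-negative byte weights packed into
// lumacoeff (B, G, R, A order, alpha weight 0). The weighted sum, with its
// low 8 bits cleared, selects a 256-byte row of the luma table used to
// remap B, G and R; alpha is copied through.
#if 0  // not compiled; reference sketch
static void ARGBLumaColorTableRow_Sketch(const uint8* src_argb,
                                         uint8* dst_argb,
                                         int width,
                                         const uint8* luma,
                                         uint32 lumacoeff) {
  const uint8 cb = (uint8)(lumacoeff & 0xff);
  const uint8 cg = (uint8)((lumacoeff >> 8) & 0xff);
  const uint8 cr = (uint8)((lumacoeff >> 16) & 0xff);
  int i;
  for (i = 0; i < width; ++i) {
    int l = (src_argb[0] * cb + src_argb[1] * cg + src_argb[2] * cr) & ~0xff;
    const uint8* row = luma + l;     // 256-byte sub-table for this luma
    dst_argb[0] = row[src_argb[0]];  // remap B
    dst_argb[1] = row[src_argb[1]];  // remap G
    dst_argb[2] = row[src_argb[2]];  // remap R
    dst_argb[3] = src_argb[3];       // copy alpha
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif  // 0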
6333
6334 #endif // defined(_M_IX86)
6335
6336 #ifdef __cplusplus
6337 } // extern "C"
6338 } // namespace libyuv
6339 #endif
6340
6341 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
6342