/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include "libyuv/row.h"
12 
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17 
18 // This module is for Visual C x86.
19 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
20 
21 // TODO(fbarchard): I420ToRGB24, I420ToRAW
22 #ifdef HAS_ARGBTOYROW_SSSE3
23 
24 // Constants for ARGB.
25 static const vec8 kARGBToY = {
26   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
27 };
28 
29 static const vec8 kARGBToU = {
30   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
31 };
32 
33 static const vec8 kARGBToV = {
34   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
35 };
36 
37 // Constants for BGRA.
38 static const vec8 kBGRAToY = {
39   0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
40 };
41 
42 static const vec8 kBGRAToU = {
43   0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
44 };
45 
46 static const vec8 kBGRAToV = {
47   0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
48 };
49 
50 // Constants for ABGR.
51 static const vec8 kABGRToY = {
52   33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
53 };
54 
55 static const vec8 kABGRToU = {
56   -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
57 };
58 
59 static const vec8 kABGRToV = {
60   112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
61 };
62 
63 // Constants for RGBA.
64 static const vec8 kRGBAToY = {
65   0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
66 };
67 
68 static const vec8 kRGBAToU = {
69   0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
70 };
71 
72 static const vec8 kRGBAToV = {
73   0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
74 };
75 
76 static const uvec8 kAddY16 = {
77   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
78 };
79 
80 static const uvec8 kAddUV128 = {
81   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
82   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
83 };
84 
85 // Shuffle table for converting RGB24 to ARGB.
86 static const uvec8 kShuffleMaskRGB24ToARGB = {
87   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
88 };
89 
90 // Shuffle table for converting RAW to ARGB.
91 static const uvec8 kShuffleMaskRAWToARGB = {
92   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
93 };
94 
95 // Shuffle table for converting BGRA to ARGB.
96 static const uvec8 kShuffleMaskBGRAToARGB = {
97   3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
98 };
99 
100 // Shuffle table for converting ABGR to ARGB.
101 static const uvec8 kShuffleMaskABGRToARGB = {
102   2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
103 };
104 
105 // Shuffle table for converting RGBA to ARGB.
106 static const uvec8 kShuffleMaskRGBAToARGB = {
107   1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
108 };
109 
110 // Shuffle table for converting ARGB to RGBA.
111 static const uvec8 kShuffleMaskARGBToRGBA = {
112   3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
113 };
114 
115 // Shuffle table for converting ARGB to RGB24.
116 static const uvec8 kShuffleMaskARGBToRGB24 = {
117   0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
118 };
119 
120 // Shuffle table for converting ARGB to RAW.
121 static const uvec8 kShuffleMaskARGBToRAW = {
122   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
123 };
124 
125 __declspec(naked) __declspec(align(16))
I400ToARGBRow_SSE2(const uint8 * src_y,uint8 * dst_argb,int pix)126 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
127   __asm {
128     mov        eax, [esp + 4]        // src_y
129     mov        edx, [esp + 8]        // dst_argb
130     mov        ecx, [esp + 12]       // pix
131     pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
132     pslld      xmm5, 24
133 
134     align      16
135   convertloop:
136     movq       xmm0, qword ptr [eax]
137     lea        eax,  [eax + 8]
138     punpcklbw  xmm0, xmm0
139     movdqa     xmm1, xmm0
140     punpcklwd  xmm0, xmm0
141     punpckhwd  xmm1, xmm1
142     por        xmm0, xmm5
143     por        xmm1, xmm5
144     movdqa     [edx], xmm0
145     movdqa     [edx + 16], xmm1
146     lea        edx, [edx + 32]
147     sub        ecx, 8
148     jg         convertloop
149     ret
150   }
151 }
152 
153 __declspec(naked) __declspec(align(16))
BGRAToARGBRow_SSSE3(const uint8 * src_bgra,uint8 * dst_argb,int pix)154 void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
155 __asm {
156     mov       eax, [esp + 4]   // src_bgra
157     mov       edx, [esp + 8]   // dst_argb
158     mov       ecx, [esp + 12]  // pix
159     movdqa    xmm5, kShuffleMaskBGRAToARGB
160     sub       edx, eax
161 
162     align      16
163  convertloop:
164     movdqa    xmm0, [eax]
165     pshufb    xmm0, xmm5
166     sub       ecx, 4
167     movdqa    [eax + edx], xmm0
168     lea       eax, [eax + 16]
169     jg        convertloop
170     ret
171   }
172 }
173 
174 __declspec(naked) __declspec(align(16))
ABGRToARGBRow_SSSE3(const uint8 * src_abgr,uint8 * dst_argb,int pix)175 void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
176 __asm {
177     mov       eax, [esp + 4]   // src_abgr
178     mov       edx, [esp + 8]   // dst_argb
179     mov       ecx, [esp + 12]  // pix
180     movdqa    xmm5, kShuffleMaskABGRToARGB
181     sub       edx, eax
182 
183     align      16
184  convertloop:
185     movdqa    xmm0, [eax]
186     pshufb    xmm0, xmm5
187     sub       ecx, 4
188     movdqa    [eax + edx], xmm0
189     lea       eax, [eax + 16]
190     jg        convertloop
191     ret
192   }
193 }
194 
195 __declspec(naked) __declspec(align(16))
RGBAToARGBRow_SSSE3(const uint8 * src_rgba,uint8 * dst_argb,int pix)196 void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
197 __asm {
198     mov       eax, [esp + 4]   // src_rgba
199     mov       edx, [esp + 8]   // dst_argb
200     mov       ecx, [esp + 12]  // pix
201     movdqa    xmm5, kShuffleMaskRGBAToARGB
202     sub       edx, eax
203 
204     align      16
205  convertloop:
206     movdqa    xmm0, [eax]
207     pshufb    xmm0, xmm5
208     sub       ecx, 4
209     movdqa    [eax + edx], xmm0
210     lea       eax, [eax + 16]
211     jg        convertloop
212     ret
213   }
214 }
215 
216 __declspec(naked) __declspec(align(16))
ARGBToRGBARow_SSSE3(const uint8 * src_argb,uint8 * dst_rgba,int pix)217 void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
218 __asm {
219     mov       eax, [esp + 4]   // src_argb
220     mov       edx, [esp + 8]   // dst_rgba
221     mov       ecx, [esp + 12]  // pix
222     movdqa    xmm5, kShuffleMaskARGBToRGBA
223     sub       edx, eax
224 
225     align      16
226  convertloop:
227     movdqa    xmm0, [eax]
228     pshufb    xmm0, xmm5
229     sub       ecx, 4
230     movdqa    [eax + edx], xmm0
231     lea       eax, [eax + 16]
232     jg        convertloop
233     ret
234   }
235 }
236 
237 __declspec(naked) __declspec(align(16))
RGB24ToARGBRow_SSSE3(const uint8 * src_rgb24,uint8 * dst_argb,int pix)238 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
239 __asm {
240     mov       eax, [esp + 4]   // src_rgb24
241     mov       edx, [esp + 8]   // dst_argb
242     mov       ecx, [esp + 12]  // pix
243     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
244     pslld     xmm5, 24
245     movdqa    xmm4, kShuffleMaskRGB24ToARGB
246 
247     align      16
248  convertloop:
249     movdqu    xmm0, [eax]
250     movdqu    xmm1, [eax + 16]
251     movdqu    xmm3, [eax + 32]
252     lea       eax, [eax + 48]
253     movdqa    xmm2, xmm3
254     palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
255     pshufb    xmm2, xmm4
256     por       xmm2, xmm5
257     palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
258     pshufb    xmm0, xmm4
259     movdqa    [edx + 32], xmm2
260     por       xmm0, xmm5
261     pshufb    xmm1, xmm4
262     movdqa    [edx], xmm0
263     por       xmm1, xmm5
264     palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
265     pshufb    xmm3, xmm4
266     movdqa    [edx + 16], xmm1
267     por       xmm3, xmm5
268     sub       ecx, 16
269     movdqa    [edx + 48], xmm3
270     lea       edx, [edx + 64]
271     jg        convertloop
272     ret
273   }
274 }
275 
276 __declspec(naked) __declspec(align(16))
RAWToARGBRow_SSSE3(const uint8 * src_raw,uint8 * dst_argb,int pix)277 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
278                         int pix) {
279 __asm {
280     mov       eax, [esp + 4]   // src_raw
281     mov       edx, [esp + 8]   // dst_argb
282     mov       ecx, [esp + 12]  // pix
283     pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
284     pslld     xmm5, 24
285     movdqa    xmm4, kShuffleMaskRAWToARGB
286 
287     align      16
288  convertloop:
289     movdqu    xmm0, [eax]
290     movdqu    xmm1, [eax + 16]
291     movdqu    xmm3, [eax + 32]
292     lea       eax, [eax + 48]
293     movdqa    xmm2, xmm3
294     palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
295     pshufb    xmm2, xmm4
296     por       xmm2, xmm5
297     palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
298     pshufb    xmm0, xmm4
299     movdqa    [edx + 32], xmm2
300     por       xmm0, xmm5
301     pshufb    xmm1, xmm4
302     movdqa    [edx], xmm0
303     por       xmm1, xmm5
304     palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
305     pshufb    xmm3, xmm4
306     movdqa    [edx + 16], xmm1
307     por       xmm3, xmm5
308     sub       ecx, 16
309     movdqa    [edx + 48], xmm3
310     lea       edx, [edx + 64]
311     jg        convertloop
312     ret
313   }
314 }
315 
316 // pmul method to replicate bits.
317 // Math to replicate bits:
318 // (v << 8) | (v << 3)
319 // v * 256 + v * 8
320 // v * (256 + 8)
321 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
322 // 20 instructions.
323 __declspec(naked) __declspec(align(16))
RGB565ToARGBRow_SSE2(const uint8 * src_rgb565,uint8 * dst_argb,int pix)324 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
325                           int pix) {
326 __asm {
327     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
328     movd      xmm5, eax
329     pshufd    xmm5, xmm5, 0
330     mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
331     movd      xmm6, eax
332     pshufd    xmm6, xmm6, 0
333     pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
334     psllw     xmm3, 11
335     pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
336     psllw     xmm4, 10
337     psrlw     xmm4, 5
338     pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
339     psllw     xmm7, 8
340 
341     mov       eax, [esp + 4]   // src_rgb565
342     mov       edx, [esp + 8]   // dst_argb
343     mov       ecx, [esp + 12]  // pix
344     sub       edx, eax
345     sub       edx, eax
346 
347     align      16
348  convertloop:
349     movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
350     movdqa    xmm1, xmm0
351     movdqa    xmm2, xmm0
352     pand      xmm1, xmm3    // R in upper 5 bits
353     psllw     xmm2, 11      // B in upper 5 bits
354     pmulhuw   xmm1, xmm5    // * (256 + 8)
355     pmulhuw   xmm2, xmm5    // * (256 + 8)
356     psllw     xmm1, 8
357     por       xmm1, xmm2    // RB
358     pand      xmm0, xmm4    // G in middle 6 bits
359     pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
360     por       xmm0, xmm7    // AG
361     movdqa    xmm2, xmm1
362     punpcklbw xmm1, xmm0
363     punpckhbw xmm2, xmm0
364     movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
365     movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
366     lea       eax, [eax + 16]
367     sub       ecx, 8
368     jg        convertloop
369     ret
370   }
371 }
372 
373 // 24 instructions
374 __declspec(naked) __declspec(align(16))
ARGB1555ToARGBRow_SSE2(const uint8 * src_argb1555,uint8 * dst_argb,int pix)375 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
376                             int pix) {
377 __asm {
378     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
379     movd      xmm5, eax
380     pshufd    xmm5, xmm5, 0
381     mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
382     movd      xmm6, eax
383     pshufd    xmm6, xmm6, 0
384     pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
385     psllw     xmm3, 11
386     movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
387     psrlw     xmm4, 6
388     pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
389     psllw     xmm7, 8
390 
391     mov       eax, [esp + 4]   // src_argb1555
392     mov       edx, [esp + 8]   // dst_argb
393     mov       ecx, [esp + 12]  // pix
394     sub       edx, eax
395     sub       edx, eax
396 
397     align      16
398  convertloop:
399     movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
400     movdqa    xmm1, xmm0
401     movdqa    xmm2, xmm0
402     psllw     xmm1, 1       // R in upper 5 bits
403     psllw     xmm2, 11      // B in upper 5 bits
404     pand      xmm1, xmm3
405     pmulhuw   xmm2, xmm5    // * (256 + 8)
406     pmulhuw   xmm1, xmm5    // * (256 + 8)
407     psllw     xmm1, 8
408     por       xmm1, xmm2    // RB
409     movdqa    xmm2, xmm0
410     pand      xmm0, xmm4    // G in middle 5 bits
411     psraw     xmm2, 8       // A
412     pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
413     pand      xmm2, xmm7
414     por       xmm0, xmm2    // AG
415     movdqa    xmm2, xmm1
416     punpcklbw xmm1, xmm0
417     punpckhbw xmm2, xmm0
418     movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
419     movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
420     lea       eax, [eax + 16]
421     sub       ecx, 8
422     jg        convertloop
423     ret
424   }
425 }
426 
427 // 18 instructions.
428 __declspec(naked) __declspec(align(16))
ARGB4444ToARGBRow_SSE2(const uint8 * src_argb4444,uint8 * dst_argb,int pix)429 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
430                             int pix) {
431 __asm {
432     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
433     movd      xmm4, eax
434     pshufd    xmm4, xmm4, 0
435     movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
436     pslld     xmm5, 4
437     mov       eax, [esp + 4]   // src_argb4444
438     mov       edx, [esp + 8]   // dst_argb
439     mov       ecx, [esp + 12]  // pix
440     sub       edx, eax
441     sub       edx, eax
442 
443     align      16
444  convertloop:
445     movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
446     movdqa    xmm2, xmm0
447     pand      xmm0, xmm4    // mask low nibbles
448     pand      xmm2, xmm5    // mask high nibbles
449     movdqa    xmm1, xmm0
450     movdqa    xmm3, xmm2
451     psllw     xmm1, 4
452     psrlw     xmm3, 4
453     por       xmm0, xmm1
454     por       xmm2, xmm3
455     movdqa    xmm1, xmm0
456     punpcklbw xmm0, xmm2
457     punpckhbw xmm1, xmm2
458     movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
459     movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
460     lea       eax, [eax + 16]
461     sub       ecx, 8
462     jg        convertloop
463     ret
464   }
465 }
466 
467 __declspec(naked) __declspec(align(16))
ARGBToRGB24Row_SSSE3(const uint8 * src_argb,uint8 * dst_rgb,int pix)468 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
469 __asm {
470     mov       eax, [esp + 4]   // src_argb
471     mov       edx, [esp + 8]   // dst_rgb
472     mov       ecx, [esp + 12]  // pix
473     movdqa    xmm6, kShuffleMaskARGBToRGB24
474 
475     align      16
476  convertloop:
477     movdqa    xmm0, [eax]   // fetch 16 pixels of argb
478     movdqa    xmm1, [eax + 16]
479     movdqa    xmm2, [eax + 32]
480     movdqa    xmm3, [eax + 48]
481     lea       eax, [eax + 64]
482     pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
483     pshufb    xmm1, xmm6
484     pshufb    xmm2, xmm6
485     pshufb    xmm3, xmm6
486     movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
487     psrldq    xmm1, 4      // 8 bytes from 1
488     pslldq    xmm4, 12     // 4 bytes from 1 for 0
489     movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
490     por       xmm0, xmm4   // 4 bytes from 1 for 0
491     pslldq    xmm5, 8      // 8 bytes from 2 for 1
492     movdqa    [edx], xmm0  // store 0
493     por       xmm1, xmm5   // 8 bytes from 2 for 1
494     psrldq    xmm2, 8      // 4 bytes from 2
495     pslldq    xmm3, 4      // 12 bytes from 3 for 2
496     por       xmm2, xmm3   // 12 bytes from 3 for 2
497     movdqa    [edx + 16], xmm1   // store 1
498     movdqa    [edx + 32], xmm2   // store 2
499     lea       edx, [edx + 48]
500     sub       ecx, 16
501     jg        convertloop
502     ret
503   }
504 }
505 
506 __declspec(naked) __declspec(align(16))
ARGBToRAWRow_SSSE3(const uint8 * src_argb,uint8 * dst_rgb,int pix)507 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
508 __asm {
509     mov       eax, [esp + 4]   // src_argb
510     mov       edx, [esp + 8]   // dst_rgb
511     mov       ecx, [esp + 12]  // pix
512     movdqa    xmm6, kShuffleMaskARGBToRAW
513 
514     align      16
515  convertloop:
516     movdqa    xmm0, [eax]   // fetch 16 pixels of argb
517     movdqa    xmm1, [eax + 16]
518     movdqa    xmm2, [eax + 32]
519     movdqa    xmm3, [eax + 48]
520     lea       eax, [eax + 64]
521     pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
522     pshufb    xmm1, xmm6
523     pshufb    xmm2, xmm6
524     pshufb    xmm3, xmm6
525     movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
526     psrldq    xmm1, 4      // 8 bytes from 1
527     pslldq    xmm4, 12     // 4 bytes from 1 for 0
528     movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
529     por       xmm0, xmm4   // 4 bytes from 1 for 0
530     pslldq    xmm5, 8      // 8 bytes from 2 for 1
531     movdqa    [edx], xmm0  // store 0
532     por       xmm1, xmm5   // 8 bytes from 2 for 1
533     psrldq    xmm2, 8      // 4 bytes from 2
534     pslldq    xmm3, 4      // 12 bytes from 3 for 2
535     por       xmm2, xmm3   // 12 bytes from 3 for 2
536     movdqa    [edx + 16], xmm1   // store 1
537     movdqa    [edx + 32], xmm2   // store 2
538     lea       edx, [edx + 48]
539     sub       ecx, 16
540     jg        convertloop
541     ret
542   }
543 }
544 
545 __declspec(naked) __declspec(align(16))
ARGBToRGB565Row_SSE2(const uint8 * src_argb,uint8 * dst_rgb,int pix)546 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
547 __asm {
548     mov       eax, [esp + 4]   // src_argb
549     mov       edx, [esp + 8]   // dst_rgb
550     mov       ecx, [esp + 12]  // pix
551     pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
552     psrld     xmm3, 27
553     pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
554     psrld     xmm4, 26
555     pslld     xmm4, 5
556     pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
557     pslld     xmm5, 11
558 
559     align      16
560  convertloop:
561     movdqa    xmm0, [eax]   // fetch 4 pixels of argb
562     movdqa    xmm1, xmm0    // B
563     movdqa    xmm2, xmm0    // G
564     pslld     xmm0, 8       // R
565     psrld     xmm1, 3       // B
566     psrld     xmm2, 5       // G
567     psrad     xmm0, 16      // R
568     pand      xmm1, xmm3    // B
569     pand      xmm2, xmm4    // G
570     pand      xmm0, xmm5    // R
571     por       xmm1, xmm2    // BG
572     por       xmm0, xmm1    // BGR
573     packssdw  xmm0, xmm0
574     lea       eax, [eax + 16]
575     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
576     lea       edx, [edx + 8]
577     sub       ecx, 4
578     jg        convertloop
579     ret
580   }
581 }
582 
583 // TODO(fbarchard): Improve sign extension/packing.
584 __declspec(naked) __declspec(align(16))
ARGBToARGB1555Row_SSE2(const uint8 * src_argb,uint8 * dst_rgb,int pix)585 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
586 __asm {
587     mov       eax, [esp + 4]   // src_argb
588     mov       edx, [esp + 8]   // dst_rgb
589     mov       ecx, [esp + 12]  // pix
590     pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
591     psrld     xmm4, 27
592     movdqa    xmm5, xmm4       // generate mask 0x000003e0
593     pslld     xmm5, 5
594     movdqa    xmm6, xmm4       // generate mask 0x00007c00
595     pslld     xmm6, 10
596     pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
597     pslld     xmm7, 15
598 
599     align      16
600  convertloop:
601     movdqa    xmm0, [eax]   // fetch 4 pixels of argb
602     movdqa    xmm1, xmm0    // B
603     movdqa    xmm2, xmm0    // G
604     movdqa    xmm3, xmm0    // R
605     psrad     xmm0, 16      // A
606     psrld     xmm1, 3       // B
607     psrld     xmm2, 6       // G
608     psrld     xmm3, 9       // R
609     pand      xmm0, xmm7    // A
610     pand      xmm1, xmm4    // B
611     pand      xmm2, xmm5    // G
612     pand      xmm3, xmm6    // R
613     por       xmm0, xmm1    // BA
614     por       xmm2, xmm3    // GR
615     por       xmm0, xmm2    // BGRA
616     packssdw  xmm0, xmm0
617     lea       eax, [eax + 16]
618     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
619     lea       edx, [edx + 8]
620     sub       ecx, 4
621     jg        convertloop
622     ret
623   }
624 }
625 
626 __declspec(naked) __declspec(align(16))
ARGBToARGB4444Row_SSE2(const uint8 * src_argb,uint8 * dst_rgb,int pix)627 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
628 __asm {
629     mov       eax, [esp + 4]   // src_argb
630     mov       edx, [esp + 8]   // dst_rgb
631     mov       ecx, [esp + 12]  // pix
632     pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
633     psllw     xmm4, 12
634     movdqa    xmm3, xmm4       // generate mask 0x00f000f0
635     psrlw     xmm3, 8
636 
637     align      16
638  convertloop:
639     movdqa    xmm0, [eax]   // fetch 4 pixels of argb
640     movdqa    xmm1, xmm0
641     pand      xmm0, xmm3    // low nibble
642     pand      xmm1, xmm4    // high nibble
643     psrl      xmm0, 4
644     psrl      xmm1, 8
645     por       xmm0, xmm1
646     packuswb  xmm0, xmm0
647     lea       eax, [eax + 16]
648     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
649     lea       edx, [edx + 8]
650     sub       ecx, 4
651     jg        convertloop
652     ret
653   }
654 }
655 
656 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
657 __declspec(naked) __declspec(align(16))
ARGBToYRow_SSSE3(const uint8 * src_argb,uint8 * dst_y,int pix)658 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
659 __asm {
660     mov        eax, [esp + 4]   /* src_argb */
661     mov        edx, [esp + 8]   /* dst_y */
662     mov        ecx, [esp + 12]  /* pix */
663     movdqa     xmm5, kAddY16
664     movdqa     xmm4, kARGBToY
665 
666     align      16
667  convertloop:
668     movdqa     xmm0, [eax]
669     movdqa     xmm1, [eax + 16]
670     movdqa     xmm2, [eax + 32]
671     movdqa     xmm3, [eax + 48]
672     pmaddubsw  xmm0, xmm4
673     pmaddubsw  xmm1, xmm4
674     pmaddubsw  xmm2, xmm4
675     pmaddubsw  xmm3, xmm4
676     lea        eax, [eax + 64]
677     phaddw     xmm0, xmm1
678     phaddw     xmm2, xmm3
679     psrlw      xmm0, 7
680     psrlw      xmm2, 7
681     packuswb   xmm0, xmm2
682     paddb      xmm0, xmm5
683     sub        ecx, 16
684     movdqa     [edx], xmm0
685     lea        edx, [edx + 16]
686     jg         convertloop
687     ret
688   }
689 }
690 
691 __declspec(naked) __declspec(align(16))
ARGBToYRow_Unaligned_SSSE3(const uint8 * src_argb,uint8 * dst_y,int pix)692 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
693 __asm {
694     mov        eax, [esp + 4]   /* src_argb */
695     mov        edx, [esp + 8]   /* dst_y */
696     mov        ecx, [esp + 12]  /* pix */
697     movdqa     xmm5, kAddY16
698     movdqa     xmm4, kARGBToY
699 
700     align      16
701  convertloop:
702     movdqu     xmm0, [eax]
703     movdqu     xmm1, [eax + 16]
704     movdqu     xmm2, [eax + 32]
705     movdqu     xmm3, [eax + 48]
706     pmaddubsw  xmm0, xmm4
707     pmaddubsw  xmm1, xmm4
708     pmaddubsw  xmm2, xmm4
709     pmaddubsw  xmm3, xmm4
710     lea        eax, [eax + 64]
711     phaddw     xmm0, xmm1
712     phaddw     xmm2, xmm3
713     psrlw      xmm0, 7
714     psrlw      xmm2, 7
715     packuswb   xmm0, xmm2
716     paddb      xmm0, xmm5
717     sub        ecx, 16
718     movdqu     [edx], xmm0
719     lea        edx, [edx + 16]
720     jg         convertloop
721     ret
722   }
723 }
724 
725 __declspec(naked) __declspec(align(16))
BGRAToYRow_SSSE3(const uint8 * src_argb,uint8 * dst_y,int pix)726 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
727 __asm {
728     mov        eax, [esp + 4]   /* src_argb */
729     mov        edx, [esp + 8]   /* dst_y */
730     mov        ecx, [esp + 12]  /* pix */
731     movdqa     xmm5, kAddY16
732     movdqa     xmm4, kBGRAToY
733 
734     align      16
735  convertloop:
736     movdqa     xmm0, [eax]
737     movdqa     xmm1, [eax + 16]
738     movdqa     xmm2, [eax + 32]
739     movdqa     xmm3, [eax + 48]
740     pmaddubsw  xmm0, xmm4
741     pmaddubsw  xmm1, xmm4
742     pmaddubsw  xmm2, xmm4
743     pmaddubsw  xmm3, xmm4
744     lea        eax, [eax + 64]
745     phaddw     xmm0, xmm1
746     phaddw     xmm2, xmm3
747     psrlw      xmm0, 7
748     psrlw      xmm2, 7
749     packuswb   xmm0, xmm2
750     paddb      xmm0, xmm5
751     sub        ecx, 16
752     movdqa     [edx], xmm0
753     lea        edx, [edx + 16]
754     jg         convertloop
755     ret
756   }
757 }
758 
759 __declspec(naked) __declspec(align(16))
BGRAToYRow_Unaligned_SSSE3(const uint8 * src_argb,uint8 * dst_y,int pix)760 void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
761 __asm {
762     mov        eax, [esp + 4]   /* src_argb */
763     mov        edx, [esp + 8]   /* dst_y */
764     mov        ecx, [esp + 12]  /* pix */
765     movdqa     xmm5, kAddY16
766     movdqa     xmm4, kBGRAToY
767 
768     align      16
769  convertloop:
770     movdqu     xmm0, [eax]
771     movdqu     xmm1, [eax + 16]
772     movdqu     xmm2, [eax + 32]
773     movdqu     xmm3, [eax + 48]
774     pmaddubsw  xmm0, xmm4
775     pmaddubsw  xmm1, xmm4
776     pmaddubsw  xmm2, xmm4
777     pmaddubsw  xmm3, xmm4
778     lea        eax, [eax + 64]
779     phaddw     xmm0, xmm1
780     phaddw     xmm2, xmm3
781     psrlw      xmm0, 7
782     psrlw      xmm2, 7
783     packuswb   xmm0, xmm2
784     paddb      xmm0, xmm5
785     sub        ecx, 16
786     movdqu     [edx], xmm0
787     lea        edx, [edx + 16]
788     jg         convertloop
789     ret
790   }
791 }
792 
793 __declspec(naked) __declspec(align(16))
ABGRToYRow_SSSE3(const uint8 * src_argb,uint8 * dst_y,int pix)794 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
795 __asm {
796     mov        eax, [esp + 4]   /* src_argb */
797     mov        edx, [esp + 8]   /* dst_y */
798     mov        ecx, [esp + 12]  /* pix */
799     movdqa     xmm5, kAddY16
800     movdqa     xmm4, kABGRToY
801 
802     align      16
803  convertloop:
804     movdqa     xmm0, [eax]
805     movdqa     xmm1, [eax + 16]
806     movdqa     xmm2, [eax + 32]
807     movdqa     xmm3, [eax + 48]
808     pmaddubsw  xmm0, xmm4
809     pmaddubsw  xmm1, xmm4
810     pmaddubsw  xmm2, xmm4
811     pmaddubsw  xmm3, xmm4
812     lea        eax, [eax + 64]
813     phaddw     xmm0, xmm1
814     phaddw     xmm2, xmm3
815     psrlw      xmm0, 7
816     psrlw      xmm2, 7
817     packuswb   xmm0, xmm2
818     paddb      xmm0, xmm5
819     sub        ecx, 16
820     movdqa     [edx], xmm0
821     lea        edx, [edx + 16]
822     jg         convertloop
823     ret
824   }
825 }
826 
827 __declspec(naked) __declspec(align(16))
ABGRToYRow_Unaligned_SSSE3(const uint8 * src_argb,uint8 * dst_y,int pix)828 void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
829 __asm {
830     mov        eax, [esp + 4]   /* src_argb */
831     mov        edx, [esp + 8]   /* dst_y */
832     mov        ecx, [esp + 12]  /* pix */
833     movdqa     xmm5, kAddY16
834     movdqa     xmm4, kABGRToY
835 
836     align      16
837  convertloop:
838     movdqu     xmm0, [eax]
839     movdqu     xmm1, [eax + 16]
840     movdqu     xmm2, [eax + 32]
841     movdqu     xmm3, [eax + 48]
842     pmaddubsw  xmm0, xmm4
843     pmaddubsw  xmm1, xmm4
844     pmaddubsw  xmm2, xmm4
845     pmaddubsw  xmm3, xmm4
846     lea        eax, [eax + 64]
847     phaddw     xmm0, xmm1
848     phaddw     xmm2, xmm3
849     psrlw      xmm0, 7
850     psrlw      xmm2, 7
851     packuswb   xmm0, xmm2
852     paddb      xmm0, xmm5
853     sub        ecx, 16
854     movdqu     [edx], xmm0
855     lea        edx, [edx + 16]
856     jg         convertloop
857     ret
858   }
859 }
860 
861 __declspec(naked) __declspec(align(16))
RGBAToYRow_SSSE3(const uint8 * src_argb,uint8 * dst_y,int pix)862 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
863 __asm {
864     mov        eax, [esp + 4]   /* src_argb */
865     mov        edx, [esp + 8]   /* dst_y */
866     mov        ecx, [esp + 12]  /* pix */
867     movdqa     xmm5, kAddY16
868     movdqa     xmm4, kRGBAToY
869 
870     align      16
871  convertloop:
872     movdqa     xmm0, [eax]
873     movdqa     xmm1, [eax + 16]
874     movdqa     xmm2, [eax + 32]
875     movdqa     xmm3, [eax + 48]
876     pmaddubsw  xmm0, xmm4
877     pmaddubsw  xmm1, xmm4
878     pmaddubsw  xmm2, xmm4
879     pmaddubsw  xmm3, xmm4
880     lea        eax, [eax + 64]
881     phaddw     xmm0, xmm1
882     phaddw     xmm2, xmm3
883     psrlw      xmm0, 7
884     psrlw      xmm2, 7
885     packuswb   xmm0, xmm2
886     paddb      xmm0, xmm5
887     sub        ecx, 16
888     movdqa     [edx], xmm0
889     lea        edx, [edx + 16]
890     jg         convertloop
891     ret
892   }
893 }
894 
895 __declspec(naked) __declspec(align(16))
RGBAToYRow_Unaligned_SSSE3(const uint8 * src_argb,uint8 * dst_y,int pix)896 void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
897 __asm {
898     mov        eax, [esp + 4]   /* src_argb */
899     mov        edx, [esp + 8]   /* dst_y */
900     mov        ecx, [esp + 12]  /* pix */
901     movdqa     xmm5, kAddY16
902     movdqa     xmm4, kRGBAToY
903 
904     align      16
905  convertloop:
906     movdqu     xmm0, [eax]
907     movdqu     xmm1, [eax + 16]
908     movdqu     xmm2, [eax + 32]
909     movdqu     xmm3, [eax + 48]
910     pmaddubsw  xmm0, xmm4
911     pmaddubsw  xmm1, xmm4
912     pmaddubsw  xmm2, xmm4
913     pmaddubsw  xmm3, xmm4
914     lea        eax, [eax + 64]
915     phaddw     xmm0, xmm1
916     phaddw     xmm2, xmm3
917     psrlw      xmm0, 7
918     psrlw      xmm2, 7
919     packuswb   xmm0, xmm2
920     paddb      xmm0, xmm5
921     sub        ecx, 16
922     movdqu     [edx], xmm0
923     lea        edx, [edx + 16]
924     jg         convertloop
925     ret
926   }
927 }
928 
929 __declspec(naked) __declspec(align(16))
ARGBToUVRow_SSSE3(const uint8 * src_argb0,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)930 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
931                        uint8* dst_u, uint8* dst_v, int width) {
932 __asm {
933     push       esi
934     push       edi
935     mov        eax, [esp + 8 + 4]   // src_argb
936     mov        esi, [esp + 8 + 8]   // src_stride_argb
937     mov        edx, [esp + 8 + 12]  // dst_u
938     mov        edi, [esp + 8 + 16]  // dst_v
939     mov        ecx, [esp + 8 + 20]  // pix
940     movdqa     xmm7, kARGBToU
941     movdqa     xmm6, kARGBToV
942     movdqa     xmm5, kAddUV128
943     sub        edi, edx             // stride from u to v
944 
945     align      16
946  convertloop:
947     /* step 1 - subsample 16x2 argb pixels to 8x1 */
948     movdqa     xmm0, [eax]
949     movdqa     xmm1, [eax + 16]
950     movdqa     xmm2, [eax + 32]
951     movdqa     xmm3, [eax + 48]
952     pavgb      xmm0, [eax + esi]
953     pavgb      xmm1, [eax + esi + 16]
954     pavgb      xmm2, [eax + esi + 32]
955     pavgb      xmm3, [eax + esi + 48]
956     lea        eax,  [eax + 64]
957     movdqa     xmm4, xmm0
958     shufps     xmm0, xmm1, 0x88
959     shufps     xmm4, xmm1, 0xdd
960     pavgb      xmm0, xmm4
961     movdqa     xmm4, xmm2
962     shufps     xmm2, xmm3, 0x88
963     shufps     xmm4, xmm3, 0xdd
964     pavgb      xmm2, xmm4
965 
966     // step 2 - convert to U and V
967     // from here down is very similar to Y code except
968     // instead of 16 different pixels, its 8 pixels of U and 8 of V
969     movdqa     xmm1, xmm0
970     movdqa     xmm3, xmm2
971     pmaddubsw  xmm0, xmm7  // U
972     pmaddubsw  xmm2, xmm7
973     pmaddubsw  xmm1, xmm6  // V
974     pmaddubsw  xmm3, xmm6
975     phaddw     xmm0, xmm2
976     phaddw     xmm1, xmm3
977     psraw      xmm0, 8
978     psraw      xmm1, 8
979     packsswb   xmm0, xmm1
980     paddb      xmm0, xmm5            // -> unsigned
981 
982     // step 3 - store 8 U and 8 V values
983     sub        ecx, 16
984     movlps     qword ptr [edx], xmm0 // U
985     movhps     qword ptr [edx + edi], xmm0 // V
986     lea        edx, [edx + 8]
987     jg         convertloop
988 
989     pop        edi
990     pop        esi
991     ret
992   }
993 }
994 
995 __declspec(naked) __declspec(align(16))
ARGBToUVRow_Unaligned_SSSE3(const uint8 * src_argb0,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)996 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
997                                  uint8* dst_u, uint8* dst_v, int width) {
998 __asm {
999     push       esi
1000     push       edi
1001     mov        eax, [esp + 8 + 4]   // src_argb
1002     mov        esi, [esp + 8 + 8]   // src_stride_argb
1003     mov        edx, [esp + 8 + 12]  // dst_u
1004     mov        edi, [esp + 8 + 16]  // dst_v
1005     mov        ecx, [esp + 8 + 20]  // pix
1006     movdqa     xmm7, kARGBToU
1007     movdqa     xmm6, kARGBToV
1008     movdqa     xmm5, kAddUV128
1009     sub        edi, edx             // stride from u to v
1010 
1011     align      16
1012  convertloop:
1013     /* step 1 - subsample 16x2 argb pixels to 8x1 */
1014     movdqu     xmm0, [eax]
1015     movdqu     xmm1, [eax + 16]
1016     movdqu     xmm2, [eax + 32]
1017     movdqu     xmm3, [eax + 48]
1018     movdqu     xmm4, [eax + esi]
1019     pavgb      xmm0, xmm4
1020     movdqu     xmm4, [eax + esi + 16]
1021     pavgb      xmm1, xmm4
1022     movdqu     xmm4, [eax + esi + 32]
1023     pavgb      xmm2, xmm4
1024     movdqu     xmm4, [eax + esi + 48]
1025     pavgb      xmm3, xmm4
1026     lea        eax,  [eax + 64]
1027     movdqa     xmm4, xmm0
1028     shufps     xmm0, xmm1, 0x88
1029     shufps     xmm4, xmm1, 0xdd
1030     pavgb      xmm0, xmm4
1031     movdqa     xmm4, xmm2
1032     shufps     xmm2, xmm3, 0x88
1033     shufps     xmm4, xmm3, 0xdd
1034     pavgb      xmm2, xmm4
1035 
1036     // step 2 - convert to U and V
1037     // from here down is very similar to Y code except
1038     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1039     movdqa     xmm1, xmm0
1040     movdqa     xmm3, xmm2
1041     pmaddubsw  xmm0, xmm7  // U
1042     pmaddubsw  xmm2, xmm7
1043     pmaddubsw  xmm1, xmm6  // V
1044     pmaddubsw  xmm3, xmm6
1045     phaddw     xmm0, xmm2
1046     phaddw     xmm1, xmm3
1047     psraw      xmm0, 8
1048     psraw      xmm1, 8
1049     packsswb   xmm0, xmm1
1050     paddb      xmm0, xmm5            // -> unsigned
1051 
1052     // step 3 - store 8 U and 8 V values
1053     sub        ecx, 16
1054     movlps     qword ptr [edx], xmm0 // U
1055     movhps     qword ptr [edx + edi], xmm0 // V
1056     lea        edx, [edx + 8]
1057     jg         convertloop
1058 
1059     pop        edi
1060     pop        esi
1061     ret
1062   }
1063 }
1064 
1065 __declspec(naked) __declspec(align(16))
BGRAToUVRow_SSSE3(const uint8 * src_argb0,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1066 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1067                        uint8* dst_u, uint8* dst_v, int width) {
1068 __asm {
1069     push       esi
1070     push       edi
1071     mov        eax, [esp + 8 + 4]   // src_argb
1072     mov        esi, [esp + 8 + 8]   // src_stride_argb
1073     mov        edx, [esp + 8 + 12]  // dst_u
1074     mov        edi, [esp + 8 + 16]  // dst_v
1075     mov        ecx, [esp + 8 + 20]  // pix
1076     movdqa     xmm7, kBGRAToU
1077     movdqa     xmm6, kBGRAToV
1078     movdqa     xmm5, kAddUV128
1079     sub        edi, edx             // stride from u to v
1080 
1081     align      16
1082  convertloop:
1083     /* step 1 - subsample 16x2 argb pixels to 8x1 */
1084     movdqa     xmm0, [eax]
1085     movdqa     xmm1, [eax + 16]
1086     movdqa     xmm2, [eax + 32]
1087     movdqa     xmm3, [eax + 48]
1088     pavgb      xmm0, [eax + esi]
1089     pavgb      xmm1, [eax + esi + 16]
1090     pavgb      xmm2, [eax + esi + 32]
1091     pavgb      xmm3, [eax + esi + 48]
1092     lea        eax,  [eax + 64]
1093     movdqa     xmm4, xmm0
1094     shufps     xmm0, xmm1, 0x88
1095     shufps     xmm4, xmm1, 0xdd
1096     pavgb      xmm0, xmm4
1097     movdqa     xmm4, xmm2
1098     shufps     xmm2, xmm3, 0x88
1099     shufps     xmm4, xmm3, 0xdd
1100     pavgb      xmm2, xmm4
1101 
1102     // step 2 - convert to U and V
1103     // from here down is very similar to Y code except
1104     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1105     movdqa     xmm1, xmm0
1106     movdqa     xmm3, xmm2
1107     pmaddubsw  xmm0, xmm7  // U
1108     pmaddubsw  xmm2, xmm7
1109     pmaddubsw  xmm1, xmm6  // V
1110     pmaddubsw  xmm3, xmm6
1111     phaddw     xmm0, xmm2
1112     phaddw     xmm1, xmm3
1113     psraw      xmm0, 8
1114     psraw      xmm1, 8
1115     packsswb   xmm0, xmm1
1116     paddb      xmm0, xmm5            // -> unsigned
1117 
1118     // step 3 - store 8 U and 8 V values
1119     sub        ecx, 16
1120     movlps     qword ptr [edx], xmm0 // U
1121     movhps     qword ptr [edx + edi], xmm0 // V
1122     lea        edx, [edx + 8]
1123     jg         convertloop
1124 
1125     pop        edi
1126     pop        esi
1127     ret
1128   }
1129 }
1130 
1131 __declspec(naked) __declspec(align(16))
BGRAToUVRow_Unaligned_SSSE3(const uint8 * src_argb0,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1132 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
1133                                  uint8* dst_u, uint8* dst_v, int width) {
1134 __asm {
1135     push       esi
1136     push       edi
1137     mov        eax, [esp + 8 + 4]   // src_argb
1138     mov        esi, [esp + 8 + 8]   // src_stride_argb
1139     mov        edx, [esp + 8 + 12]  // dst_u
1140     mov        edi, [esp + 8 + 16]  // dst_v
1141     mov        ecx, [esp + 8 + 20]  // pix
1142     movdqa     xmm7, kBGRAToU
1143     movdqa     xmm6, kBGRAToV
1144     movdqa     xmm5, kAddUV128
1145     sub        edi, edx             // stride from u to v
1146 
1147     align      16
1148  convertloop:
1149     /* step 1 - subsample 16x2 argb pixels to 8x1 */
1150     movdqu     xmm0, [eax]
1151     movdqu     xmm1, [eax + 16]
1152     movdqu     xmm2, [eax + 32]
1153     movdqu     xmm3, [eax + 48]
1154     movdqu     xmm4, [eax + esi]
1155     pavgb      xmm0, xmm4
1156     movdqu     xmm4, [eax + esi + 16]
1157     pavgb      xmm1, xmm4
1158     movdqu     xmm4, [eax + esi + 32]
1159     pavgb      xmm2, xmm4
1160     movdqu     xmm4, [eax + esi + 48]
1161     pavgb      xmm3, xmm4
1162     lea        eax,  [eax + 64]
1163     movdqa     xmm4, xmm0
1164     shufps     xmm0, xmm1, 0x88
1165     shufps     xmm4, xmm1, 0xdd
1166     pavgb      xmm0, xmm4
1167     movdqa     xmm4, xmm2
1168     shufps     xmm2, xmm3, 0x88
1169     shufps     xmm4, xmm3, 0xdd
1170     pavgb      xmm2, xmm4
1171 
1172     // step 2 - convert to U and V
1173     // from here down is very similar to Y code except
1174     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1175     movdqa     xmm1, xmm0
1176     movdqa     xmm3, xmm2
1177     pmaddubsw  xmm0, xmm7  // U
1178     pmaddubsw  xmm2, xmm7
1179     pmaddubsw  xmm1, xmm6  // V
1180     pmaddubsw  xmm3, xmm6
1181     phaddw     xmm0, xmm2
1182     phaddw     xmm1, xmm3
1183     psraw      xmm0, 8
1184     psraw      xmm1, 8
1185     packsswb   xmm0, xmm1
1186     paddb      xmm0, xmm5            // -> unsigned
1187 
1188     // step 3 - store 8 U and 8 V values
1189     sub        ecx, 16
1190     movlps     qword ptr [edx], xmm0 // U
1191     movhps     qword ptr [edx + edi], xmm0 // V
1192     lea        edx, [edx + 8]
1193     jg         convertloop
1194 
1195     pop        edi
1196     pop        esi
1197     ret
1198   }
1199 }
1200 
1201 __declspec(naked) __declspec(align(16))
ABGRToUVRow_SSSE3(const uint8 * src_argb0,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1202 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1203                        uint8* dst_u, uint8* dst_v, int width) {
1204 __asm {
1205     push       esi
1206     push       edi
1207     mov        eax, [esp + 8 + 4]   // src_argb
1208     mov        esi, [esp + 8 + 8]   // src_stride_argb
1209     mov        edx, [esp + 8 + 12]  // dst_u
1210     mov        edi, [esp + 8 + 16]  // dst_v
1211     mov        ecx, [esp + 8 + 20]  // pix
1212     movdqa     xmm7, kABGRToU
1213     movdqa     xmm6, kABGRToV
1214     movdqa     xmm5, kAddUV128
1215     sub        edi, edx             // stride from u to v
1216 
1217     align      16
1218  convertloop:
1219     /* step 1 - subsample 16x2 argb pixels to 8x1 */
1220     movdqa     xmm0, [eax]
1221     movdqa     xmm1, [eax + 16]
1222     movdqa     xmm2, [eax + 32]
1223     movdqa     xmm3, [eax + 48]
1224     pavgb      xmm0, [eax + esi]
1225     pavgb      xmm1, [eax + esi + 16]
1226     pavgb      xmm2, [eax + esi + 32]
1227     pavgb      xmm3, [eax + esi + 48]
1228     lea        eax,  [eax + 64]
1229     movdqa     xmm4, xmm0
1230     shufps     xmm0, xmm1, 0x88
1231     shufps     xmm4, xmm1, 0xdd
1232     pavgb      xmm0, xmm4
1233     movdqa     xmm4, xmm2
1234     shufps     xmm2, xmm3, 0x88
1235     shufps     xmm4, xmm3, 0xdd
1236     pavgb      xmm2, xmm4
1237 
1238     // step 2 - convert to U and V
1239     // from here down is very similar to Y code except
1240     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1241     movdqa     xmm1, xmm0
1242     movdqa     xmm3, xmm2
1243     pmaddubsw  xmm0, xmm7  // U
1244     pmaddubsw  xmm2, xmm7
1245     pmaddubsw  xmm1, xmm6  // V
1246     pmaddubsw  xmm3, xmm6
1247     phaddw     xmm0, xmm2
1248     phaddw     xmm1, xmm3
1249     psraw      xmm0, 8
1250     psraw      xmm1, 8
1251     packsswb   xmm0, xmm1
1252     paddb      xmm0, xmm5            // -> unsigned
1253 
1254     // step 3 - store 8 U and 8 V values
1255     sub        ecx, 16
1256     movlps     qword ptr [edx], xmm0 // U
1257     movhps     qword ptr [edx + edi], xmm0 // V
1258     lea        edx, [edx + 8]
1259     jg         convertloop
1260 
1261     pop        edi
1262     pop        esi
1263     ret
1264   }
1265 }
1266 
1267 __declspec(naked) __declspec(align(16))
ABGRToUVRow_Unaligned_SSSE3(const uint8 * src_argb0,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1268 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
1269                                  uint8* dst_u, uint8* dst_v, int width) {
1270 __asm {
1271     push       esi
1272     push       edi
1273     mov        eax, [esp + 8 + 4]   // src_argb
1274     mov        esi, [esp + 8 + 8]   // src_stride_argb
1275     mov        edx, [esp + 8 + 12]  // dst_u
1276     mov        edi, [esp + 8 + 16]  // dst_v
1277     mov        ecx, [esp + 8 + 20]  // pix
1278     movdqa     xmm7, kABGRToU
1279     movdqa     xmm6, kABGRToV
1280     movdqa     xmm5, kAddUV128
1281     sub        edi, edx             // stride from u to v
1282 
1283     align      16
1284  convertloop:
1285     /* step 1 - subsample 16x2 argb pixels to 8x1 */
1286     movdqu     xmm0, [eax]
1287     movdqu     xmm1, [eax + 16]
1288     movdqu     xmm2, [eax + 32]
1289     movdqu     xmm3, [eax + 48]
1290     movdqu     xmm4, [eax + esi]
1291     pavgb      xmm0, xmm4
1292     movdqu     xmm4, [eax + esi + 16]
1293     pavgb      xmm1, xmm4
1294     movdqu     xmm4, [eax + esi + 32]
1295     pavgb      xmm2, xmm4
1296     movdqu     xmm4, [eax + esi + 48]
1297     pavgb      xmm3, xmm4
1298     lea        eax,  [eax + 64]
1299     movdqa     xmm4, xmm0
1300     shufps     xmm0, xmm1, 0x88
1301     shufps     xmm4, xmm1, 0xdd
1302     pavgb      xmm0, xmm4
1303     movdqa     xmm4, xmm2
1304     shufps     xmm2, xmm3, 0x88
1305     shufps     xmm4, xmm3, 0xdd
1306     pavgb      xmm2, xmm4
1307 
1308     // step 2 - convert to U and V
1309     // from here down is very similar to Y code except
1310     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1311     movdqa     xmm1, xmm0
1312     movdqa     xmm3, xmm2
1313     pmaddubsw  xmm0, xmm7  // U
1314     pmaddubsw  xmm2, xmm7
1315     pmaddubsw  xmm1, xmm6  // V
1316     pmaddubsw  xmm3, xmm6
1317     phaddw     xmm0, xmm2
1318     phaddw     xmm1, xmm3
1319     psraw      xmm0, 8
1320     psraw      xmm1, 8
1321     packsswb   xmm0, xmm1
1322     paddb      xmm0, xmm5            // -> unsigned
1323 
1324     // step 3 - store 8 U and 8 V values
1325     sub        ecx, 16
1326     movlps     qword ptr [edx], xmm0 // U
1327     movhps     qword ptr [edx + edi], xmm0 // V
1328     lea        edx, [edx + 8]
1329     jg         convertloop
1330 
1331     pop        edi
1332     pop        esi
1333     ret
1334   }
1335 }
1336 
1337 __declspec(naked) __declspec(align(16))
RGBAToUVRow_SSSE3(const uint8 * src_argb0,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1338 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1339                        uint8* dst_u, uint8* dst_v, int width) {
1340 __asm {
1341     push       esi
1342     push       edi
1343     mov        eax, [esp + 8 + 4]   // src_argb
1344     mov        esi, [esp + 8 + 8]   // src_stride_argb
1345     mov        edx, [esp + 8 + 12]  // dst_u
1346     mov        edi, [esp + 8 + 16]  // dst_v
1347     mov        ecx, [esp + 8 + 20]  // pix
1348     movdqa     xmm7, kRGBAToU
1349     movdqa     xmm6, kRGBAToV
1350     movdqa     xmm5, kAddUV128
1351     sub        edi, edx             // stride from u to v
1352 
1353     align      16
1354  convertloop:
1355     /* step 1 - subsample 16x2 argb pixels to 8x1 */
1356     movdqa     xmm0, [eax]
1357     movdqa     xmm1, [eax + 16]
1358     movdqa     xmm2, [eax + 32]
1359     movdqa     xmm3, [eax + 48]
1360     pavgb      xmm0, [eax + esi]
1361     pavgb      xmm1, [eax + esi + 16]
1362     pavgb      xmm2, [eax + esi + 32]
1363     pavgb      xmm3, [eax + esi + 48]
1364     lea        eax,  [eax + 64]
1365     movdqa     xmm4, xmm0
1366     shufps     xmm0, xmm1, 0x88
1367     shufps     xmm4, xmm1, 0xdd
1368     pavgb      xmm0, xmm4
1369     movdqa     xmm4, xmm2
1370     shufps     xmm2, xmm3, 0x88
1371     shufps     xmm4, xmm3, 0xdd
1372     pavgb      xmm2, xmm4
1373 
1374     // step 2 - convert to U and V
1375     // from here down is very similar to Y code except
1376     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1377     movdqa     xmm1, xmm0
1378     movdqa     xmm3, xmm2
1379     pmaddubsw  xmm0, xmm7  // U
1380     pmaddubsw  xmm2, xmm7
1381     pmaddubsw  xmm1, xmm6  // V
1382     pmaddubsw  xmm3, xmm6
1383     phaddw     xmm0, xmm2
1384     phaddw     xmm1, xmm3
1385     psraw      xmm0, 8
1386     psraw      xmm1, 8
1387     packsswb   xmm0, xmm1
1388     paddb      xmm0, xmm5            // -> unsigned
1389 
1390     // step 3 - store 8 U and 8 V values
1391     sub        ecx, 16
1392     movlps     qword ptr [edx], xmm0 // U
1393     movhps     qword ptr [edx + edi], xmm0 // V
1394     lea        edx, [edx + 8]
1395     jg         convertloop
1396 
1397     pop        edi
1398     pop        esi
1399     ret
1400   }
1401 }
1402 
1403 __declspec(naked) __declspec(align(16))
RGBAToUVRow_Unaligned_SSSE3(const uint8 * src_argb0,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1404 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
1405                                  uint8* dst_u, uint8* dst_v, int width) {
1406 __asm {
1407     push       esi
1408     push       edi
1409     mov        eax, [esp + 8 + 4]   // src_argb
1410     mov        esi, [esp + 8 + 8]   // src_stride_argb
1411     mov        edx, [esp + 8 + 12]  // dst_u
1412     mov        edi, [esp + 8 + 16]  // dst_v
1413     mov        ecx, [esp + 8 + 20]  // pix
1414     movdqa     xmm7, kRGBAToU
1415     movdqa     xmm6, kRGBAToV
1416     movdqa     xmm5, kAddUV128
1417     sub        edi, edx             // stride from u to v
1418 
1419     align      16
1420  convertloop:
1421     /* step 1 - subsample 16x2 argb pixels to 8x1 */
1422     movdqu     xmm0, [eax]
1423     movdqu     xmm1, [eax + 16]
1424     movdqu     xmm2, [eax + 32]
1425     movdqu     xmm3, [eax + 48]
1426     movdqu     xmm4, [eax + esi]
1427     pavgb      xmm0, xmm4
1428     movdqu     xmm4, [eax + esi + 16]
1429     pavgb      xmm1, xmm4
1430     movdqu     xmm4, [eax + esi + 32]
1431     pavgb      xmm2, xmm4
1432     movdqu     xmm4, [eax + esi + 48]
1433     pavgb      xmm3, xmm4
1434     lea        eax,  [eax + 64]
1435     movdqa     xmm4, xmm0
1436     shufps     xmm0, xmm1, 0x88
1437     shufps     xmm4, xmm1, 0xdd
1438     pavgb      xmm0, xmm4
1439     movdqa     xmm4, xmm2
1440     shufps     xmm2, xmm3, 0x88
1441     shufps     xmm4, xmm3, 0xdd
1442     pavgb      xmm2, xmm4
1443 
1444     // step 2 - convert to U and V
1445     // from here down is very similar to Y code except
1446     // instead of 16 different pixels, its 8 pixels of U and 8 of V
1447     movdqa     xmm1, xmm0
1448     movdqa     xmm3, xmm2
1449     pmaddubsw  xmm0, xmm7  // U
1450     pmaddubsw  xmm2, xmm7
1451     pmaddubsw  xmm1, xmm6  // V
1452     pmaddubsw  xmm3, xmm6
1453     phaddw     xmm0, xmm2
1454     phaddw     xmm1, xmm3
1455     psraw      xmm0, 8
1456     psraw      xmm1, 8
1457     packsswb   xmm0, xmm1
1458     paddb      xmm0, xmm5            // -> unsigned
1459 
1460     // step 3 - store 8 U and 8 V values
1461     sub        ecx, 16
1462     movlps     qword ptr [edx], xmm0 // U
1463     movhps     qword ptr [edx + edi], xmm0 // V
1464     lea        edx, [edx + 8]
1465     jg         convertloop
1466 
1467     pop        edi
1468     pop        esi
1469     ret
1470   }
1471 }
1472 #endif  // HAS_ARGBTOYROW_SSSE3
1473 
1474 #ifdef HAS_I422TOARGBROW_SSSE3
1475 
1476 #define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
1477 
1478 #define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
1479 #define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
1480 #define UR 0
1481 
1482 #define VB 0
1483 #define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
1484 #define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
1485 
1486 // Bias
1487 #define BB UB * 128 + VB * 128
1488 #define BG UG * 128 + VG * 128
1489 #define BR UR * 128 + VR * 128
1490 
1491 static const vec8 kUVToB = {
1492   UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
1493 };
1494 
1495 static const vec8 kUVToR = {
1496   UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
1497 };
1498 
1499 static const vec8 kUVToG = {
1500   UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
1501 };
1502 
1503 static const vec8 kVUToB = {
1504   VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
1505 };
1506 
1507 static const vec8 kVUToR = {
1508   VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
1509 };
1510 
1511 static const vec8 kVUToG = {
1512   VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
1513 };
1514 
1515 static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
1516 static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
1517 static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
1518 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
1519 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
1520 
1521 // TODO(fbarchard): NV12/NV21 fetch UV and use directly.
1522 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
1523 
// Read 8 UV from 444.
// In:  esi = U pointer, edi = V pointer minus U pointer.
// Out: xmm0 = 8 interleaved U,V byte pairs; esi advanced by 8.
// Overwrites xmm1.
#define READYUV444 __asm {                                                     \
    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
  }
1531 
// Read 4 UV from 422, upsample to 8 UV.
// In:  esi = U pointer, edi = V pointer minus U pointer.
// Out: xmm0 = 8 interleaved U,V byte pairs (each source pair duplicated once);
//      esi advanced by 4.
// Overwrites xmm1.
#define READYUV422 __asm {                                                     \
    __asm movd       xmm0, [esi]          /* U */                              \
    __asm movd       xmm1, [esi + edi]    /* V */                              \
    __asm lea        esi,  [esi + 4]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  }
1540 
// Read 2 UV from 411, upsample to 8 UV.
// In:  esi = U pointer, edi = V pointer minus U pointer.
// Out: xmm0 = 8 interleaved U,V byte pairs (each source pair repeated four
//      times); esi advanced by 2.
// Overwrites xmm1.
// Only 2 bytes are consumed from each plane per call, so the loads are 16-bit
// pinsrw rather than 32-bit movd: a 4-byte load would read 2 bytes past the
// end of the plane on the final group. The stale upper words left in
// xmm0/xmm1 are discarded by the unpacks below.
#define READYUV411 __asm {                                                     \
    __asm pinsrw     xmm0, word ptr [esi], 0 /* U (2 bytes) */                 \
    __asm pinsrw     xmm1, word ptr [esi + edi], 0 /* V (2 bytes) */           \
    __asm lea        esi,  [esi + 2]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
    __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
  }
1550 
// Read 4 UV from NV12, upsample to 8 UV.
// In:  esi = pointer to interleaved UV bytes.
// Out: xmm0 = 8 interleaved byte pairs (each source pair duplicated once);
//      esi advanced by 8.
#define READNV12 __asm {                                                       \
    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  }
1557 
// Convert 8 pixels: 8 UV and 8 Y.
// In:  xmm0 = 8 interleaved U,V byte pairs, eax = Y pointer, xmm4 = zero
//      (used to widen the Y bytes to words).
// Out: low 8 bytes of xmm0 = B, of xmm1 = G, of xmm2 = R, each saturated to
//      0..255; eax advanced by 8.
// Overwrites xmm3.
#define YUVTORGB __asm {                                                       \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
    __asm movdqa     xmm1, xmm0                                                \
    __asm movdqa     xmm2, xmm0                                                \
    __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
    __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
    __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
    __asm psubw      xmm1, kUVBiasG                                            \
    __asm psubw      xmm2, kUVBiasR                                            \
    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
    __asm lea        eax, [eax + 8]                                            \
    __asm punpcklbw  xmm3, xmm4                                                \
    __asm psubsw     xmm3, kYSub16                                             \
    __asm pmullw     xmm3, kYToRgb                                             \
    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
    __asm psraw      xmm0, 6                                                   \
    __asm psraw      xmm1, 6                                                   \
    __asm psraw      xmm2, 6                                                   \
    __asm packuswb   xmm0, xmm0           /* B */                              \
    __asm packuswb   xmm1, xmm1           /* G */                              \
    __asm packuswb   xmm2, xmm2           /* R */                              \
  }
1585 
// Convert 8 pixels: 8 VU and 8 Y.
// Same as YUVTORGB but uses the kVUTo* tables, whose coefficient pairs are
// swapped for V,U byte order.
// In:  xmm0 = 8 interleaved V,U byte pairs, eax = Y pointer, xmm4 = zero
//      (used to widen the Y bytes to words).
// Out: low 8 bytes of xmm0 = B, of xmm1 = G, of xmm2 = R, each saturated to
//      0..255; eax advanced by 8.
// Overwrites xmm3.
#define YVUTORGB __asm {                                                       \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
    __asm movdqa     xmm1, xmm0                                                \
    __asm movdqa     xmm2, xmm0                                                \
    __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
    __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
    __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
    __asm psubw      xmm1, kUVBiasG                                            \
    __asm psubw      xmm2, kUVBiasR                                            \
    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
    __asm lea        eax, [eax + 8]                                            \
    __asm punpcklbw  xmm3, xmm4                                                \
    __asm psubsw     xmm3, kYSub16                                             \
    __asm pmullw     xmm3, kYToRgb                                             \
    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
    __asm psraw      xmm0, 6                                                   \
    __asm psraw      xmm1, 6                                                   \
    __asm psraw      xmm2, 6                                                   \
    __asm packuswb   xmm0, xmm0           /* B */                              \
    __asm packuswb   xmm1, xmm1           /* G */                              \
    __asm packuswb   xmm2, xmm2           /* R */                              \
  }
1613 
1614 // 8 pixels, dest aligned 16.
1615 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
1616 __declspec(naked) __declspec(align(16))
I444ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * argb_buf,int width)1617 void I444ToARGBRow_SSSE3(const uint8* y_buf,
1618                          const uint8* u_buf,
1619                          const uint8* v_buf,
1620                          uint8* argb_buf,
1621                          int width) {
1622   __asm {
1623     push       esi
1624     push       edi
1625     mov        eax, [esp + 8 + 4]   // Y
1626     mov        esi, [esp + 8 + 8]   // U
1627     mov        edi, [esp + 8 + 12]  // V
1628     mov        edx, [esp + 8 + 16]  // argb
1629     mov        ecx, [esp + 8 + 20]  // width
1630     sub        edi, esi
1631     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1632     pxor       xmm4, xmm4
1633 
1634     align      16
1635  convertloop:
1636     READYUV444
1637     YUVTORGB
1638 
1639     // Step 3: Weave into ARGB
1640     punpcklbw  xmm0, xmm1           // BG
1641     punpcklbw  xmm2, xmm5           // RA
1642     movdqa     xmm1, xmm0
1643     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1644     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1645     movdqa     [edx], xmm0
1646     movdqa     [edx + 16], xmm1
1647     lea        edx,  [edx + 32]
1648     sub        ecx, 8
1649     jg         convertloop
1650 
1651     pop        edi
1652     pop        esi
1653     ret
1654   }
1655 }
1656 
1657 // 8 pixels, dest aligned 16.
1658 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1659 __declspec(naked) __declspec(align(16))
I422ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * argb_buf,int width)1660 void I422ToARGBRow_SSSE3(const uint8* y_buf,
1661                          const uint8* u_buf,
1662                          const uint8* v_buf,
1663                          uint8* argb_buf,
1664                          int width) {
1665   __asm {
1666     push       esi
1667     push       edi
1668     mov        eax, [esp + 8 + 4]   // Y
1669     mov        esi, [esp + 8 + 8]   // U
1670     mov        edi, [esp + 8 + 12]  // V
1671     mov        edx, [esp + 8 + 16]  // argb
1672     mov        ecx, [esp + 8 + 20]  // width
1673     sub        edi, esi
1674     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1675     pxor       xmm4, xmm4
1676 
1677     align      16
1678  convertloop:
1679     READYUV422
1680     YUVTORGB
1681 
1682     // Step 3: Weave into ARGB
1683     punpcklbw  xmm0, xmm1           // BG
1684     punpcklbw  xmm2, xmm5           // RA
1685     movdqa     xmm1, xmm0
1686     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1687     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1688     movdqa     [edx], xmm0
1689     movdqa     [edx + 16], xmm1
1690     lea        edx,  [edx + 32]
1691     sub        ecx, 8
1692     jg         convertloop
1693 
1694     pop        edi
1695     pop        esi
1696     ret
1697   }
1698 }
1699 
1700 // 8 pixels, dest aligned 16.
1701 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1702 // Similar to I420 but duplicate UV once more.
1703 __declspec(naked) __declspec(align(16))
I411ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * argb_buf,int width)1704 void I411ToARGBRow_SSSE3(const uint8* y_buf,
1705                          const uint8* u_buf,
1706                          const uint8* v_buf,
1707                          uint8* argb_buf,
1708                          int width) {
1709   __asm {
1710     push       esi
1711     push       edi
1712     mov        eax, [esp + 8 + 4]   // Y
1713     mov        esi, [esp + 8 + 8]   // U
1714     mov        edi, [esp + 8 + 12]  // V
1715     mov        edx, [esp + 8 + 16]  // argb
1716     mov        ecx, [esp + 8 + 20]  // width
1717     sub        edi, esi
1718     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1719     pxor       xmm4, xmm4
1720 
1721     align      16
1722  convertloop:
1723     READYUV411
1724     YUVTORGB
1725 
1726     // Step 3: Weave into ARGB
1727     punpcklbw  xmm0, xmm1           // BG
1728     punpcklbw  xmm2, xmm5           // RA
1729     movdqa     xmm1, xmm0
1730     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1731     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1732     movdqa     [edx], xmm0
1733     movdqa     [edx + 16], xmm1
1734     lea        edx,  [edx + 32]
1735     sub        ecx, 8
1736     jg         convertloop
1737 
1738     pop        edi
1739     pop        esi
1740     ret
1741   }
1742 }
1743 
1744 // 8 pixels, dest aligned 16.
1745 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1746 __declspec(naked) __declspec(align(16))
NV12ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * uv_buf,uint8 * argb_buf,int width)1747 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
1748                          const uint8* uv_buf,
1749                          uint8* argb_buf,
1750                          int width) {
1751   __asm {
1752     push       esi
1753     mov        eax, [esp + 4 + 4]   // Y
1754     mov        esi, [esp + 4 + 8]   // UV
1755     mov        edx, [esp + 4 + 12]  // argb
1756     mov        ecx, [esp + 4 + 16]  // width
1757     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1758     pxor       xmm4, xmm4
1759 
1760     align      16
1761  convertloop:
1762     READNV12
1763     YUVTORGB
1764 
1765     // Step 3: Weave into ARGB
1766     punpcklbw  xmm0, xmm1           // BG
1767     punpcklbw  xmm2, xmm5           // RA
1768     movdqa     xmm1, xmm0
1769     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1770     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1771     movdqa     [edx], xmm0
1772     movdqa     [edx + 16], xmm1
1773     lea        edx,  [edx + 32]
1774     sub        ecx, 8
1775     jg         convertloop
1776 
1777     pop        esi
1778     ret
1779   }
1780 }
1781 
1782 // 8 pixels, dest aligned 16.
1783 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1784 __declspec(naked) __declspec(align(16))
NV21ToARGBRow_SSSE3(const uint8 * y_buf,const uint8 * uv_buf,uint8 * argb_buf,int width)1785 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
1786                          const uint8* uv_buf,
1787                          uint8* argb_buf,
1788                          int width) {
1789   __asm {
1790     push       esi
1791     mov        eax, [esp + 4 + 4]   // Y
1792     mov        esi, [esp + 4 + 8]   // VU
1793     mov        edx, [esp + 4 + 12]  // argb
1794     mov        ecx, [esp + 4 + 16]  // width
1795     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1796     pxor       xmm4, xmm4
1797 
1798     align      16
1799  convertloop:
1800     READNV12
1801     YVUTORGB
1802 
1803     // Step 3: Weave into ARGB
1804     punpcklbw  xmm0, xmm1           // BG
1805     punpcklbw  xmm2, xmm5           // RA
1806     movdqa     xmm1, xmm0
1807     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1808     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1809     movdqa     [edx], xmm0
1810     movdqa     [edx + 16], xmm1
1811     lea        edx,  [edx + 32]
1812     sub        ecx, 8
1813     jg         convertloop
1814 
1815     pop        esi
1816     ret
1817   }
1818 }
1819 
1820 // 8 pixels, unaligned.
1821 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
1822 __declspec(naked) __declspec(align(16))
I444ToARGBRow_Unaligned_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * argb_buf,int width)1823 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1824                                    const uint8* u_buf,
1825                                    const uint8* v_buf,
1826                                    uint8* argb_buf,
1827                                    int width) {
1828   __asm {
1829     push       esi
1830     push       edi
1831     mov        eax, [esp + 8 + 4]   // Y
1832     mov        esi, [esp + 8 + 8]   // U
1833     mov        edi, [esp + 8 + 12]  // V
1834     mov        edx, [esp + 8 + 16]  // argb
1835     mov        ecx, [esp + 8 + 20]  // width
1836     sub        edi, esi
1837     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1838     pxor       xmm4, xmm4
1839 
1840     align      16
1841  convertloop:
1842     READYUV444
1843     YUVTORGB
1844 
1845     // Step 3: Weave into ARGB
1846     punpcklbw  xmm0, xmm1           // BG
1847     punpcklbw  xmm2, xmm5           // RA
1848     movdqa     xmm1, xmm0
1849     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1850     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1851     movdqu     [edx], xmm0
1852     movdqu     [edx + 16], xmm1
1853     lea        edx,  [edx + 32]
1854     sub        ecx, 8
1855     jg         convertloop
1856 
1857     pop        edi
1858     pop        esi
1859     ret
1860   }
1861 }
1862 
1863 // 8 pixels, unaligned.
1864 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1865 __declspec(naked) __declspec(align(16))
I422ToARGBRow_Unaligned_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * argb_buf,int width)1866 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1867                                    const uint8* u_buf,
1868                                    const uint8* v_buf,
1869                                    uint8* argb_buf,
1870                                    int width) {
1871   __asm {
1872     push       esi
1873     push       edi
1874     mov        eax, [esp + 8 + 4]   // Y
1875     mov        esi, [esp + 8 + 8]   // U
1876     mov        edi, [esp + 8 + 12]  // V
1877     mov        edx, [esp + 8 + 16]  // argb
1878     mov        ecx, [esp + 8 + 20]  // width
1879     sub        edi, esi
1880     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1881     pxor       xmm4, xmm4
1882 
1883     align      16
1884  convertloop:
1885     READYUV422
1886     YUVTORGB
1887 
1888     // Step 3: Weave into ARGB
1889     punpcklbw  xmm0, xmm1           // BG
1890     punpcklbw  xmm2, xmm5           // RA
1891     movdqa     xmm1, xmm0
1892     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1893     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1894     movdqu     [edx], xmm0
1895     movdqu     [edx + 16], xmm1
1896     lea        edx,  [edx + 32]
1897     sub        ecx, 8
1898     jg         convertloop
1899 
1900     pop        edi
1901     pop        esi
1902     ret
1903   }
1904 }
1905 
1906 // 8 pixels, unaligned.
1907 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1908 // Similar to I420 but duplicate UV once more.
1909 __declspec(naked) __declspec(align(16))
I411ToARGBRow_Unaligned_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * argb_buf,int width)1910 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1911                                    const uint8* u_buf,
1912                                    const uint8* v_buf,
1913                                    uint8* argb_buf,
1914                                    int width) {
1915   __asm {
1916     push       esi
1917     push       edi
1918     mov        eax, [esp + 8 + 4]   // Y
1919     mov        esi, [esp + 8 + 8]   // U
1920     mov        edi, [esp + 8 + 12]  // V
1921     mov        edx, [esp + 8 + 16]  // argb
1922     mov        ecx, [esp + 8 + 20]  // width
1923     sub        edi, esi
1924     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1925     pxor       xmm4, xmm4
1926 
1927     align      16
1928  convertloop:
1929     READYUV411
1930     YUVTORGB
1931 
1932     // Step 3: Weave into ARGB
1933     punpcklbw  xmm0, xmm1           // BG
1934     punpcklbw  xmm2, xmm5           // RA
1935     movdqa     xmm1, xmm0
1936     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1937     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1938     movdqu     [edx], xmm0
1939     movdqu     [edx + 16], xmm1
1940     lea        edx,  [edx + 32]
1941     sub        ecx, 8
1942     jg         convertloop
1943 
1944     pop        edi
1945     pop        esi
1946     ret
1947   }
1948 }
1949 
1950 
1951 // 8 pixels, dest aligned 16.
1952 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1953 __declspec(naked) __declspec(align(16))
NV12ToARGBRow_Unaligned_SSSE3(const uint8 * y_buf,const uint8 * uv_buf,uint8 * argb_buf,int width)1954 void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1955                                    const uint8* uv_buf,
1956                                    uint8* argb_buf,
1957                                    int width) {
1958   __asm {
1959     push       esi
1960     mov        eax, [esp + 4 + 4]   // Y
1961     mov        esi, [esp + 4 + 8]   // UV
1962     mov        edx, [esp + 4 + 12]  // argb
1963     mov        ecx, [esp + 4 + 16]  // width
1964     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
1965     pxor       xmm4, xmm4
1966 
1967     align      16
1968  convertloop:
1969     READNV12
1970     YUVTORGB
1971 
1972     // Step 3: Weave into ARGB
1973     punpcklbw  xmm0, xmm1           // BG
1974     punpcklbw  xmm2, xmm5           // RA
1975     movdqa     xmm1, xmm0
1976     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
1977     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
1978     movdqu     [edx], xmm0
1979     movdqu     [edx + 16], xmm1
1980     lea        edx,  [edx + 32]
1981     sub        ecx, 8
1982     jg         convertloop
1983 
1984     pop        esi
1985     ret
1986   }
1987 }
1988 
1989 // 8 pixels, dest aligned 16.
1990 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
1991 __declspec(naked) __declspec(align(16))
NV21ToARGBRow_Unaligned_SSSE3(const uint8 * y_buf,const uint8 * uv_buf,uint8 * argb_buf,int width)1992 void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1993                                    const uint8* uv_buf,
1994                                    uint8* argb_buf,
1995                                    int width) {
1996   __asm {
1997     push       esi
1998     mov        eax, [esp + 4 + 4]   // Y
1999     mov        esi, [esp + 4 + 8]   // VU
2000     mov        edx, [esp + 4 + 12]  // argb
2001     mov        ecx, [esp + 4 + 16]  // width
2002     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2003     pxor       xmm4, xmm4
2004 
2005     align      16
2006  convertloop:
2007     READNV12
2008     YVUTORGB
2009 
2010     // Step 3: Weave into ARGB
2011     punpcklbw  xmm0, xmm1           // BG
2012     punpcklbw  xmm2, xmm5           // RA
2013     movdqa     xmm1, xmm0
2014     punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
2015     punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
2016     movdqu     [edx], xmm0
2017     movdqu     [edx + 16], xmm1
2018     lea        edx,  [edx + 32]
2019     sub        ecx, 8
2020     jg         convertloop
2021 
2022     pop        esi
2023     ret
2024   }
2025 }
2026 
2027 __declspec(naked) __declspec(align(16))
I422ToBGRARow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * bgra_buf,int width)2028 void I422ToBGRARow_SSSE3(const uint8* y_buf,
2029                          const uint8* u_buf,
2030                          const uint8* v_buf,
2031                          uint8* bgra_buf,
2032                          int width) {
2033   __asm {
2034     push       esi
2035     push       edi
2036     mov        eax, [esp + 8 + 4]   // Y
2037     mov        esi, [esp + 8 + 8]   // U
2038     mov        edi, [esp + 8 + 12]  // V
2039     mov        edx, [esp + 8 + 16]  // bgra
2040     mov        ecx, [esp + 8 + 20]  // width
2041     sub        edi, esi
2042     pxor       xmm4, xmm4
2043 
2044     align      16
2045  convertloop:
2046     READYUV422
2047     YUVTORGB
2048 
2049     // Step 3: Weave into BGRA
2050     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2051     punpcklbw  xmm1, xmm0           // GB
2052     punpcklbw  xmm5, xmm2           // AR
2053     movdqa     xmm0, xmm5
2054     punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
2055     punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
2056     movdqa     [edx], xmm5
2057     movdqa     [edx + 16], xmm0
2058     lea        edx,  [edx + 32]
2059     sub        ecx, 8
2060     jg         convertloop
2061 
2062     pop        edi
2063     pop        esi
2064     ret
2065   }
2066 }
2067 
2068 __declspec(naked) __declspec(align(16))
I422ToBGRARow_Unaligned_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * bgra_buf,int width)2069 void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
2070                                    const uint8* u_buf,
2071                                    const uint8* v_buf,
2072                                    uint8* bgra_buf,
2073                                    int width) {
2074   __asm {
2075     push       esi
2076     push       edi
2077     mov        eax, [esp + 8 + 4]   // Y
2078     mov        esi, [esp + 8 + 8]   // U
2079     mov        edi, [esp + 8 + 12]  // V
2080     mov        edx, [esp + 8 + 16]  // bgra
2081     mov        ecx, [esp + 8 + 20]  // width
2082     sub        edi, esi
2083     pxor       xmm4, xmm4
2084 
2085     align      16
2086  convertloop:
2087     READYUV422
2088     YUVTORGB
2089 
2090     // Step 3: Weave into BGRA
2091     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2092     punpcklbw  xmm1, xmm0           // GB
2093     punpcklbw  xmm5, xmm2           // AR
2094     movdqa     xmm0, xmm5
2095     punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
2096     punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
2097     movdqu     [edx], xmm5
2098     movdqu     [edx + 16], xmm0
2099     lea        edx,  [edx + 32]
2100     sub        ecx, 8
2101     jg         convertloop
2102 
2103     pop        edi
2104     pop        esi
2105     ret
2106   }
2107 }
2108 
2109 __declspec(naked) __declspec(align(16))
I422ToABGRRow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * abgr_buf,int width)2110 void I422ToABGRRow_SSSE3(const uint8* y_buf,
2111                          const uint8* u_buf,
2112                          const uint8* v_buf,
2113                          uint8* abgr_buf,
2114                          int width) {
2115   __asm {
2116     push       esi
2117     push       edi
2118     mov        eax, [esp + 8 + 4]   // Y
2119     mov        esi, [esp + 8 + 8]   // U
2120     mov        edi, [esp + 8 + 12]  // V
2121     mov        edx, [esp + 8 + 16]  // abgr
2122     mov        ecx, [esp + 8 + 20]  // width
2123     sub        edi, esi
2124     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2125     pxor       xmm4, xmm4
2126 
2127     align      16
2128  convertloop:
2129     READYUV422
2130     YUVTORGB
2131 
2132     // Step 3: Weave into ARGB
2133     punpcklbw  xmm2, xmm1           // RG
2134     punpcklbw  xmm0, xmm5           // BA
2135     movdqa     xmm1, xmm2
2136     punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
2137     punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
2138     movdqa     [edx], xmm2
2139     movdqa     [edx + 16], xmm1
2140     lea        edx,  [edx + 32]
2141     sub        ecx, 8
2142     jg         convertloop
2143 
2144     pop        edi
2145     pop        esi
2146     ret
2147   }
2148 }
2149 
2150 __declspec(naked) __declspec(align(16))
I422ToABGRRow_Unaligned_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * abgr_buf,int width)2151 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
2152                                    const uint8* u_buf,
2153                                    const uint8* v_buf,
2154                                    uint8* abgr_buf,
2155                                    int width) {
2156   __asm {
2157     push       esi
2158     push       edi
2159     mov        eax, [esp + 8 + 4]   // Y
2160     mov        esi, [esp + 8 + 8]   // U
2161     mov        edi, [esp + 8 + 12]  // V
2162     mov        edx, [esp + 8 + 16]  // abgr
2163     mov        ecx, [esp + 8 + 20]  // width
2164     sub        edi, esi
2165     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2166     pxor       xmm4, xmm4
2167 
2168     align      16
2169  convertloop:
2170     READYUV422
2171     YUVTORGB
2172 
2173     // Step 3: Weave into ARGB
2174     punpcklbw  xmm2, xmm1           // RG
2175     punpcklbw  xmm0, xmm5           // BA
2176     movdqa     xmm1, xmm2
2177     punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
2178     punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
2179     movdqu     [edx], xmm2
2180     movdqu     [edx + 16], xmm1
2181     lea        edx,  [edx + 32]
2182     sub        ecx, 8
2183     jg         convertloop
2184 
2185     pop        edi
2186     pop        esi
2187     ret
2188   }
2189 }
2190 
2191 __declspec(naked) __declspec(align(16))
I422ToRGBARow_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgba_buf,int width)2192 void I422ToRGBARow_SSSE3(const uint8* y_buf,
2193                          const uint8* u_buf,
2194                          const uint8* v_buf,
2195                          uint8* rgba_buf,
2196                          int width) {
2197   __asm {
2198     push       esi
2199     push       edi
2200     mov        eax, [esp + 8 + 4]   // Y
2201     mov        esi, [esp + 8 + 8]   // U
2202     mov        edi, [esp + 8 + 12]  // V
2203     mov        edx, [esp + 8 + 16]  // rgba
2204     mov        ecx, [esp + 8 + 20]  // width
2205     sub        edi, esi
2206     pxor       xmm4, xmm4
2207 
2208     align      16
2209  convertloop:
2210     READYUV422
2211     YUVTORGB
2212 
2213     // Step 3: Weave into RGBA
2214     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2215     punpcklbw  xmm1, xmm2           // GR
2216     punpcklbw  xmm5, xmm0           // AB
2217     movdqa     xmm0, xmm5
2218     punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
2219     punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
2220     movdqa     [edx], xmm5
2221     movdqa     [edx + 16], xmm0
2222     lea        edx,  [edx + 32]
2223     sub        ecx, 8
2224     jg         convertloop
2225 
2226     pop        edi
2227     pop        esi
2228     ret
2229   }
2230 }
2231 
2232 __declspec(naked) __declspec(align(16))
I422ToRGBARow_Unaligned_SSSE3(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgba_buf,int width)2233 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
2234                                    const uint8* u_buf,
2235                                    const uint8* v_buf,
2236                                    uint8* rgba_buf,
2237                                    int width) {
2238   __asm {
2239     push       esi
2240     push       edi
2241     mov        eax, [esp + 8 + 4]   // Y
2242     mov        esi, [esp + 8 + 8]   // U
2243     mov        edi, [esp + 8 + 12]  // V
2244     mov        edx, [esp + 8 + 16]  // rgba
2245     mov        ecx, [esp + 8 + 20]  // width
2246     sub        edi, esi
2247     pxor       xmm4, xmm4
2248 
2249     align      16
2250  convertloop:
2251     READYUV422
2252     YUVTORGB
2253 
2254     // Step 3: Weave into RGBA
2255     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
2256     punpcklbw  xmm1, xmm2           // GR
2257     punpcklbw  xmm5, xmm0           // AB
2258     movdqa     xmm0, xmm5
2259     punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
2260     punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
2261     movdqu     [edx], xmm5
2262     movdqu     [edx + 16], xmm0
2263     lea        edx,  [edx + 32]
2264     sub        ecx, 8
2265     jg         convertloop
2266 
2267     pop        edi
2268     pop        esi
2269     ret
2270   }
2271 }
2272 
2273 #endif  // HAS_I422TOARGBROW_SSSE3
2274 
2275 #ifdef HAS_YTOARGBROW_SSE2
2276 __declspec(naked) __declspec(align(16))
YToARGBRow_SSE2(const uint8 * y_buf,uint8 * rgb_buf,int width)2277 void YToARGBRow_SSE2(const uint8* y_buf,
2278                      uint8* rgb_buf,
2279                      int width) {
2280   __asm {
2281     pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
2282     pslld      xmm4, 24
2283     mov        eax,0x10001000
2284     movd       xmm3,eax
2285     pshufd     xmm3,xmm3,0
2286     mov        eax,0x012a012a
2287     movd       xmm2,eax
2288     pshufd     xmm2,xmm2,0
2289     mov        eax, [esp + 4]       // Y
2290     mov        edx, [esp + 8]       // rgb
2291     mov        ecx, [esp + 12]      // width
2292 
2293     align      16
2294  convertloop:
2295     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2296     movq       xmm0, qword ptr [eax]
2297     lea        eax, [eax + 8]
2298     punpcklbw  xmm0, xmm0           // Y.Y
2299     psubusw    xmm0, xmm3
2300     pmulhuw    xmm0, xmm2
2301     packuswb   xmm0, xmm0           // G
2302 
2303     // Step 2: Weave into ARGB
2304     punpcklbw  xmm0, xmm0           // GG
2305     movdqa     xmm1, xmm0
2306     punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
2307     punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
2308     por        xmm0, xmm4
2309     por        xmm1, xmm4
2310     movdqa     [edx], xmm0
2311     movdqa     [edx + 16], xmm1
2312     lea        edx,  [edx + 32]
2313     sub        ecx, 8
2314     jg         convertloop
2315 
2316     ret
2317   }
2318 }
2319 #endif  // HAS_YTOARGBROW_SSE2
2320 
2321 #ifdef HAS_MIRRORROW_SSSE3
2322 
2323 // Shuffle table for reversing the bytes.
2324 static const uvec8 kShuffleMirror = {
2325   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2326 };
2327 
2328 __declspec(naked) __declspec(align(16))
MirrorRow_SSSE3(const uint8 * src,uint8 * dst,int width)2329 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2330 __asm {
2331     mov       eax, [esp + 4]   // src
2332     mov       edx, [esp + 8]   // dst
2333     mov       ecx, [esp + 12]  // width
2334     movdqa    xmm5, kShuffleMirror
2335     lea       eax, [eax - 16]
2336 
2337     align      16
2338  convertloop:
2339     movdqa    xmm0, [eax + ecx]
2340     pshufb    xmm0, xmm5
2341     sub       ecx, 16
2342     movdqa    [edx], xmm0
2343     lea       edx, [edx + 16]
2344     jg        convertloop
2345     ret
2346   }
2347 }
2348 #endif  // HAS_MIRRORROW_SSSE3
2349 
2350 #ifdef HAS_MIRRORROW_SSE2
2351 // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
2352 // version can not.
2353 __declspec(naked) __declspec(align(16))
MirrorRow_SSE2(const uint8 * src,uint8 * dst,int width)2354 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2355 __asm {
2356     mov       eax, [esp + 4]   // src
2357     mov       edx, [esp + 8]   // dst
2358     mov       ecx, [esp + 12]  // width
2359     lea       eax, [eax - 16]
2360 
2361     align      16
2362  convertloop:
2363     movdqu    xmm0, [eax + ecx]
2364     movdqa    xmm1, xmm0        // swap bytes
2365     psllw     xmm0, 8
2366     psrlw     xmm1, 8
2367     por       xmm0, xmm1
2368     pshuflw   xmm0, xmm0, 0x1b  // swap words
2369     pshufhw   xmm0, xmm0, 0x1b
2370     pshufd    xmm0, xmm0, 0x4e  // swap qwords
2371     sub       ecx, 16
2372     movdqu    [edx], xmm0
2373     lea       edx, [edx + 16]
2374     jg        convertloop
2375     ret
2376   }
2377 }
2378 #endif  // HAS_MIRRORROW_SSE2
2379 
2380 #ifdef HAS_MIRRORROW_UV_SSSE3
2381 // Shuffle table for reversing the bytes of UV channels.
2382 static const uvec8 kShuffleMirrorUV = {
2383   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2384 };
2385 
2386 __declspec(naked) __declspec(align(16))
MirrorRowUV_SSSE3(const uint8 * src,uint8 * dst_u,uint8 * dst_v,int width)2387 void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2388                        int width) {
2389   __asm {
2390     push      edi
2391     mov       eax, [esp + 4 + 4]   // src
2392     mov       edx, [esp + 4 + 8]   // dst_u
2393     mov       edi, [esp + 4 + 12]  // dst_v
2394     mov       ecx, [esp + 4 + 16]  // width
2395     movdqa    xmm1, kShuffleMirrorUV
2396     lea       eax, [eax + ecx * 2 - 16]
2397     sub       edi, edx
2398 
2399     align      16
2400  convertloop:
2401     movdqa    xmm0, [eax]
2402     lea       eax, [eax - 16]
2403     pshufb    xmm0, xmm1
2404     sub       ecx, 8
2405     movlpd    qword ptr [edx], xmm0
2406     movhpd    qword ptr [edx + edi], xmm0
2407     lea       edx, [edx + 8]
2408     jg        convertloop
2409 
2410     pop       edi
2411     ret
2412   }
2413 }
2414 #endif  // HAS_MIRRORROW_UV_SSSE3
2415 
2416 #ifdef HAS_ARGBMIRRORROW_SSSE3
2417 
2418 // Shuffle table for reversing the bytes.
2419 static const uvec8 kARGBShuffleMirror = {
2420   12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2421 };
2422 
2423 __declspec(naked) __declspec(align(16))
ARGBMirrorRow_SSSE3(const uint8 * src,uint8 * dst,int width)2424 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2425 __asm {
2426     mov       eax, [esp + 4]   // src
2427     mov       edx, [esp + 8]   // dst
2428     mov       ecx, [esp + 12]  // width
2429     movdqa    xmm5, kARGBShuffleMirror
2430     lea       eax, [eax - 16]
2431 
2432     align      16
2433  convertloop:
2434     movdqa    xmm0, [eax + ecx * 4]
2435     pshufb    xmm0, xmm5
2436     sub       ecx, 4
2437     movdqa    [edx], xmm0
2438     lea       edx, [edx + 16]
2439     jg        convertloop
2440     ret
2441   }
2442 }
2443 #endif  // HAS_ARGBMIRRORROW_SSSE3
2444 
2445 #ifdef HAS_SPLITUV_SSE2
2446 __declspec(naked) __declspec(align(16))
SplitUV_SSE2(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int pix)2447 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2448   __asm {
2449     push       edi
2450     mov        eax, [esp + 4 + 4]    // src_uv
2451     mov        edx, [esp + 4 + 8]    // dst_u
2452     mov        edi, [esp + 4 + 12]   // dst_v
2453     mov        ecx, [esp + 4 + 16]   // pix
2454     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2455     psrlw      xmm5, 8
2456     sub        edi, edx
2457 
2458     align      16
2459   convertloop:
2460     movdqa     xmm0, [eax]
2461     movdqa     xmm1, [eax + 16]
2462     lea        eax,  [eax + 32]
2463     movdqa     xmm2, xmm0
2464     movdqa     xmm3, xmm1
2465     pand       xmm0, xmm5   // even bytes
2466     pand       xmm1, xmm5
2467     packuswb   xmm0, xmm1
2468     psrlw      xmm2, 8      // odd bytes
2469     psrlw      xmm3, 8
2470     packuswb   xmm2, xmm3
2471     movdqa     [edx], xmm0
2472     movdqa     [edx + edi], xmm2
2473     lea        edx, [edx + 16]
2474     sub        ecx, 16
2475     jg         convertloop
2476 
2477     pop        edi
2478     ret
2479   }
2480 }
2481 #endif  // HAS_SPLITUV_SSE2
2482 
2483 #ifdef HAS_COPYROW_SSE2
2484 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
2485 __declspec(naked) __declspec(align(16))
CopyRow_SSE2(const uint8 * src,uint8 * dst,int count)2486 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2487   __asm {
2488     mov        eax, [esp + 4]   // src
2489     mov        edx, [esp + 8]   // dst
2490     mov        ecx, [esp + 12]  // count
2491     sub        edx, eax
2492 
2493     align      16
2494   convertloop:
2495     movdqa     xmm0, [eax]
2496     movdqa     xmm1, [eax + 16]
2497     movdqa     [eax + edx], xmm0
2498     movdqa     [eax + edx + 16], xmm1
2499     lea        eax, [eax + 32]
2500     sub        ecx, 32
2501     jg         convertloop
2502     ret
2503   }
2504 }
2505 #endif  // HAS_COPYROW_SSE2
2506 
2507 #ifdef HAS_COPYROW_X86
2508 __declspec(naked) __declspec(align(16))
CopyRow_X86(const uint8 * src,uint8 * dst,int count)2509 void CopyRow_X86(const uint8* src, uint8* dst, int count) {
2510   __asm {
2511     mov        eax, esi
2512     mov        edx, edi
2513     mov        esi, [esp + 4]   // src
2514     mov        edi, [esp + 8]   // dst
2515     mov        ecx, [esp + 12]  // count
2516     shr        ecx, 2
2517     rep movsd
2518     mov        edi, edx
2519     mov        esi, eax
2520     ret
2521   }
2522 }
2523 #endif  // HAS_COPYROW_X86
2524 
2525 #ifdef HAS_SETROW_X86
2526 // SetRow8 writes 'count' bytes using a 32 bit value repeated.
2527 __declspec(naked) __declspec(align(16))
SetRow8_X86(uint8 * dst,uint32 v32,int count)2528 void SetRow8_X86(uint8* dst, uint32 v32, int count) {
2529   __asm {
2530     mov        edx, edi
2531     mov        edi, [esp + 4]   // dst
2532     mov        eax, [esp + 8]   // v32
2533     mov        ecx, [esp + 12]  // count
2534     shr        ecx, 2
2535     rep stosd
2536     mov        edi, edx
2537     ret
2538   }
2539 }
2540 
2541 // SetRow32 writes 'count' words using a 32 bit value repeated.
2542 __declspec(naked) __declspec(align(16))
SetRows32_X86(uint8 * dst,uint32 v32,int width,int dst_stride,int height)2543 void SetRows32_X86(uint8* dst, uint32 v32, int width,
2544                    int dst_stride, int height) {
2545   __asm {
2546     push       esi
2547     push       edi
2548     push       ebp
2549     mov        edi, [esp + 12 + 4]   // dst
2550     mov        eax, [esp + 12 + 8]   // v32
2551     mov        ebp, [esp + 12 + 12]  // width
2552     mov        edx, [esp + 12 + 16]  // dst_stride
2553     mov        esi, [esp + 12 + 20]  // height
2554     lea        ecx, [ebp * 4]
2555     sub        edx, ecx             // stride - width * 4
2556 
2557     align      16
2558   convertloop:
2559     mov        ecx, ebp
2560     rep stosd
2561     add        edi, edx
2562     sub        esi, 1
2563     jg         convertloop
2564 
2565     pop        ebp
2566     pop        edi
2567     pop        esi
2568     ret
2569   }
2570 }
2571 #endif  // HAS_SETROW_X86
2572 
2573 #ifdef HAS_YUY2TOYROW_SSE2
2574 __declspec(naked) __declspec(align(16))
YUY2ToYRow_SSE2(const uint8 * src_yuy2,uint8 * dst_y,int pix)2575 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
2576                      uint8* dst_y, int pix) {
2577   __asm {
2578     mov        eax, [esp + 4]    // src_yuy2
2579     mov        edx, [esp + 8]    // dst_y
2580     mov        ecx, [esp + 12]   // pix
2581     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
2582     psrlw      xmm5, 8
2583 
2584     align      16
2585   convertloop:
2586     movdqa     xmm0, [eax]
2587     movdqa     xmm1, [eax + 16]
2588     lea        eax,  [eax + 32]
2589     pand       xmm0, xmm5   // even bytes are Y
2590     pand       xmm1, xmm5
2591     packuswb   xmm0, xmm1
2592     sub        ecx, 16
2593     movdqa     [edx], xmm0
2594     lea        edx, [edx + 16]
2595     jg         convertloop
2596     ret
2597   }
2598 }
2599 
// Produces one row of U and one row of V from two rows of packed YUY2.
// The row at src_yuy2 and the row at src_yuy2 + stride_yuy2 are averaged
// bytewise with pavgb, the odd-indexed bytes (chroma) are kept, and the
// resulting UVUV sequence is split into separate U and V bytes.
//   src_yuy2     first source row; both rows are read with movdqa, so both
//                must be 16-byte aligned.
//   stride_yuy2  byte offset from the first source row to the second.
//   dst_u/dst_v  destinations, written 8 bytes at a time with movq.
//   pix          number of pixels; handled in groups of 16.
// Each iteration reads 32 bytes from each row and writes 8 bytes of U and 8
// bytes of V. The loop body runs before the count is tested, so at least one
// group is always processed. esi and edi are saved and restored.
2600 __declspec(naked) __declspec(align(16))
YUY2ToUVRow_SSE2(const uint8 * src_yuy2,int stride_yuy2,uint8 * dst_u,uint8 * dst_v,int pix)2601 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
2602                       uint8* dst_u, uint8* dst_v, int pix) {
2603   __asm {
2604     push       esi
2605     push       edi
2606     mov        eax, [esp + 8 + 4]    // src_yuy2
2607     mov        esi, [esp + 8 + 8]    // stride_yuy2
2608     mov        edx, [esp + 8 + 12]   // dst_u
2609     mov        edi, [esp + 8 + 16]   // dst_v
2610     mov        ecx, [esp + 8 + 20]   // pix
2611     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2612     psrlw      xmm5, 8
2613     sub        edi, edx              // edi = dst_v - dst_u; [edx + edi] is dst_v
2614 
2615     align      16
2616   convertloop:
2617     movdqa     xmm0, [eax]
2618     movdqa     xmm1, [eax + 16]
2619     movdqa     xmm2, [eax + esi]
2620     movdqa     xmm3, [eax + esi + 16]
2621     lea        eax,  [eax + 32]
2622     pavgb      xmm0, xmm2   // average the two rows
2623     pavgb      xmm1, xmm3
2624     psrlw      xmm0, 8      // YUYV -> UVUV
2625     psrlw      xmm1, 8
2626     packuswb   xmm0, xmm1
2627     movdqa     xmm1, xmm0
2628     pand       xmm0, xmm5  // U
2629     packuswb   xmm0, xmm0
2630     psrlw      xmm1, 8     // V
2631     packuswb   xmm1, xmm1
2632     movq       qword ptr [edx], xmm0
2633     movq       qword ptr [edx + edi], xmm1
2634     lea        edx, [edx + 8]
2635     sub        ecx, 16
2636     jg         convertloop
2637 
2638     pop        edi
2639     pop        esi
2640     ret
2641   }
2642 }
2643 
// Produces one row of U and one row of V from a single row of packed YUY2
// (no averaging with a second row). The odd-indexed source bytes (chroma) are
// kept and the resulting UVUV sequence is split into separate U and V bytes.
//   src_yuy2     source row, read with movdqa (must be 16-byte aligned).
//   dst_u/dst_v  destinations, written 8 bytes at a time with movq.
//   pix          number of pixels; handled in groups of 16.
// Each iteration reads 32 source bytes and writes 8 bytes of U and 8 bytes of
// V. The loop body runs before the count is tested, so at least one group is
// always processed. edi is saved and restored.
2644 __declspec(naked) __declspec(align(16))
YUY2ToUV422Row_SSE2(const uint8 * src_yuy2,uint8 * dst_u,uint8 * dst_v,int pix)2645 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2646                          uint8* dst_u, uint8* dst_v, int pix) {
2647   __asm {
2648     push       edi
2649     mov        eax, [esp + 4 + 4]    // src_yuy2
2650     mov        edx, [esp + 4 + 8]    // dst_u
2651     mov        edi, [esp + 4 + 12]   // dst_v
2652     mov        ecx, [esp + 4 + 16]   // pix
2653     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2654     psrlw      xmm5, 8
2655     sub        edi, edx              // edi = dst_v - dst_u; [edx + edi] is dst_v
2656 
2657     align      16
2658   convertloop:
2659     movdqa     xmm0, [eax]
2660     movdqa     xmm1, [eax + 16]
2661     lea        eax,  [eax + 32]
2662     psrlw      xmm0, 8      // YUYV -> UVUV
2663     psrlw      xmm1, 8
2664     packuswb   xmm0, xmm1
2665     movdqa     xmm1, xmm0
2666     pand       xmm0, xmm5  // U
2667     packuswb   xmm0, xmm0
2668     psrlw      xmm1, 8     // V
2669     packuswb   xmm1, xmm1
2670     movq       qword ptr [edx], xmm0
2671     movq       qword ptr [edx + edi], xmm1
2672     lea        edx, [edx + 8]
2673     sub        ecx, 16
2674     jg         convertloop
2675 
2676     pop        edi
2677     ret
2678   }
2679 }
2680 
// Same operation as YUY2ToYRow_SSE2, but the loads and the store use movdqu,
// so neither src_yuy2 nor dst_y needs to be 16-byte aligned.
// The even-indexed source bytes (Y) are kept and stored contiguously in dst_y.
// Each iteration reads 32 source bytes and writes 16 bytes; the loop body runs
// before the count is tested, so at least one 16-pixel group is processed.
2681 __declspec(naked) __declspec(align(16))
YUY2ToYRow_Unaligned_SSE2(const uint8 * src_yuy2,uint8 * dst_y,int pix)2682 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2683                                uint8* dst_y, int pix) {
2684   __asm {
2685     mov        eax, [esp + 4]    // src_yuy2
2686     mov        edx, [esp + 8]    // dst_y
2687     mov        ecx, [esp + 12]   // pix
2688     pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
2689     psrlw      xmm5, 8
2690 
2691     align      16
2692   convertloop:
2693     movdqu     xmm0, [eax]
2694     movdqu     xmm1, [eax + 16]
2695     lea        eax,  [eax + 32]
2696     pand       xmm0, xmm5   // even bytes are Y
2697     pand       xmm1, xmm5
2698     packuswb   xmm0, xmm1   // 16 words -> 16 Y bytes
2699     sub        ecx, 16      // sets the flags tested by jg below
2700     movdqu     [edx], xmm0  // movdqu and lea leave the flags untouched
2701     lea        edx, [edx + 16]
2702     jg         convertloop
2703     ret
2704   }
2705 }
2706 
// Same operation as YUY2ToUVRow_SSE2, but both source rows are read with
// movdqu, so they do not need to be 16-byte aligned.
// The row at src_yuy2 and the row at src_yuy2 + stride_yuy2 are averaged
// bytewise, the odd-indexed bytes (chroma) are kept and split into U and V.
// Each iteration reads 32 bytes from each row and writes 8 bytes to dst_u and
// 8 bytes to dst_v; at least one 16-pixel group is always processed.
// esi and edi are saved and restored.
2707 __declspec(naked) __declspec(align(16))
YUY2ToUVRow_Unaligned_SSE2(const uint8 * src_yuy2,int stride_yuy2,uint8 * dst_u,uint8 * dst_v,int pix)2708 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
2709                                 uint8* dst_u, uint8* dst_v, int pix) {
2710   __asm {
2711     push       esi
2712     push       edi
2713     mov        eax, [esp + 8 + 4]    // src_yuy2
2714     mov        esi, [esp + 8 + 8]    // stride_yuy2
2715     mov        edx, [esp + 8 + 12]   // dst_u
2716     mov        edi, [esp + 8 + 16]   // dst_v
2717     mov        ecx, [esp + 8 + 20]   // pix
2718     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2719     psrlw      xmm5, 8
2720     sub        edi, edx              // edi = dst_v - dst_u; [edx + edi] is dst_v
2721 
2722     align      16
2723   convertloop:
2724     movdqu     xmm0, [eax]
2725     movdqu     xmm1, [eax + 16]
2726     movdqu     xmm2, [eax + esi]
2727     movdqu     xmm3, [eax + esi + 16]
2728     lea        eax,  [eax + 32]
2729     pavgb      xmm0, xmm2   // average the two rows
2730     pavgb      xmm1, xmm3
2731     psrlw      xmm0, 8      // YUYV -> UVUV
2732     psrlw      xmm1, 8
2733     packuswb   xmm0, xmm1
2734     movdqa     xmm1, xmm0
2735     pand       xmm0, xmm5  // U
2736     packuswb   xmm0, xmm0
2737     psrlw      xmm1, 8     // V
2738     packuswb   xmm1, xmm1
2739     movq       qword ptr [edx], xmm0
2740     movq       qword ptr [edx + edi], xmm1
2741     lea        edx, [edx + 8]
2742     sub        ecx, 16
2743     jg         convertloop
2744 
2745     pop        edi
2746     pop        esi
2747     ret
2748   }
2749 }
2750 
// Same operation as YUY2ToUV422Row_SSE2, but the source row is read with
// movdqu, so it does not need to be 16-byte aligned.
// The odd-indexed source bytes (chroma) of a single row are kept and split
// into separate U and V bytes; no second row is averaged in.
// Each iteration reads 32 source bytes and writes 8 bytes to dst_u and 8 bytes
// to dst_v; at least one 16-pixel group is always processed.
// edi is saved and restored.
2751 __declspec(naked) __declspec(align(16))
YUY2ToUV422Row_Unaligned_SSE2(const uint8 * src_yuy2,uint8 * dst_u,uint8 * dst_v,int pix)2752 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2753                                    uint8* dst_u, uint8* dst_v, int pix) {
2754   __asm {
2755     push       edi
2756     mov        eax, [esp + 4 + 4]    // src_yuy2
2757     mov        edx, [esp + 4 + 8]    // dst_u
2758     mov        edi, [esp + 4 + 12]   // dst_v
2759     mov        ecx, [esp + 4 + 16]   // pix
2760     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2761     psrlw      xmm5, 8
2762     sub        edi, edx              // edi = dst_v - dst_u; [edx + edi] is dst_v
2763 
2764     align      16
2765   convertloop:
2766     movdqu     xmm0, [eax]
2767     movdqu     xmm1, [eax + 16]
2768     lea        eax,  [eax + 32]
2769     psrlw      xmm0, 8      // YUYV -> UVUV
2770     psrlw      xmm1, 8
2771     packuswb   xmm0, xmm1
2772     movdqa     xmm1, xmm0
2773     pand       xmm0, xmm5  // U
2774     packuswb   xmm0, xmm0
2775     psrlw      xmm1, 8     // V
2776     packuswb   xmm1, xmm1
2777     movq       qword ptr [edx], xmm0
2778     movq       qword ptr [edx + edi], xmm1
2779     lea        edx, [edx + 8]
2780     sub        ecx, 16
2781     jg         convertloop
2782 
2783     pop        edi
2784     ret
2785   }
2786 }
2787 
// Extracts luma from one row of packed UYVY: the odd-indexed source bytes are
// kept (by shifting each 16-bit word right by 8) and stored contiguously in
// dst_y.
//   src_uyvy  source row, read with movdqa (must be 16-byte aligned).
//   dst_y     destination row, written with movdqa (must be 16-byte aligned).
//   pix       number of pixels; handled in groups of 16.
// Each iteration reads 32 source bytes and writes 16 bytes. The loop body runs
// before the count is tested, so at least one group is always processed.
2788 __declspec(naked) __declspec(align(16))
UYVYToYRow_SSE2(const uint8 * src_uyvy,uint8 * dst_y,int pix)2789 void UYVYToYRow_SSE2(const uint8* src_uyvy,
2790                      uint8* dst_y, int pix) {
2791   __asm {
2792     mov        eax, [esp + 4]    // src_uyvy
2793     mov        edx, [esp + 8]    // dst_y
2794     mov        ecx, [esp + 12]   // pix
2795 
2796     align      16
2797   convertloop:
2798     movdqa     xmm0, [eax]
2799     movdqa     xmm1, [eax + 16]
2800     lea        eax,  [eax + 32]
2801     psrlw      xmm0, 8    // odd bytes are Y
2802     psrlw      xmm1, 8
2803     packuswb   xmm0, xmm1 // 16 words -> 16 Y bytes
2804     sub        ecx, 16    // sets the flags tested by jg below
2805     movdqa     [edx], xmm0
2806     lea        edx, [edx + 16]
2807     jg         convertloop
2808     ret
2809   }
2810 }
2811 
// Produces one row of U and one row of V from two rows of packed UYVY.
// The row at src_uyvy and the row at src_uyvy + stride_uyvy are averaged
// bytewise with pavgb, the even-indexed bytes (chroma) are kept, and the
// resulting UVUV sequence is split into separate U and V bytes.
//   src_uyvy     first source row; both rows are read with movdqa, so both
//                must be 16-byte aligned.
//   stride_uyvy  byte offset from the first source row to the second.
//   dst_u/dst_v  destinations, written 8 bytes at a time with movq.
//   pix          number of pixels; handled in groups of 16.
// Each iteration reads 32 bytes from each row and writes 8 bytes of U and 8
// bytes of V. The loop body runs before the count is tested, so at least one
// group is always processed. esi and edi are saved and restored.
2812 __declspec(naked) __declspec(align(16))
UYVYToUVRow_SSE2(const uint8 * src_uyvy,int stride_uyvy,uint8 * dst_u,uint8 * dst_v,int pix)2813 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2814                       uint8* dst_u, uint8* dst_v, int pix) {
2815   __asm {
2816     push       esi
2817     push       edi
2818     mov        eax, [esp + 8 + 4]    // src_uyvy
2819     mov        esi, [esp + 8 + 8]    // stride_uyvy
2820     mov        edx, [esp + 8 + 12]   // dst_u
2821     mov        edi, [esp + 8 + 16]   // dst_v
2822     mov        ecx, [esp + 8 + 20]   // pix
2823     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2824     psrlw      xmm5, 8
2825     sub        edi, edx              // edi = dst_v - dst_u; [edx + edi] is dst_v
2826 
2827     align      16
2828   convertloop:
2829     movdqa     xmm0, [eax]
2830     movdqa     xmm1, [eax + 16]
2831     movdqa     xmm2, [eax + esi]
2832     movdqa     xmm3, [eax + esi + 16]
2833     lea        eax,  [eax + 32]
2834     pavgb      xmm0, xmm2   // average the two rows
2835     pavgb      xmm1, xmm3
2836     pand       xmm0, xmm5   // UYVY -> UVUV
2837     pand       xmm1, xmm5
2838     packuswb   xmm0, xmm1
2839     movdqa     xmm1, xmm0
2840     pand       xmm0, xmm5  // U
2841     packuswb   xmm0, xmm0
2842     psrlw      xmm1, 8     // V
2843     packuswb   xmm1, xmm1
2844     movq       qword ptr [edx], xmm0
2845     movq       qword ptr [edx + edi], xmm1
2846     lea        edx, [edx + 8]
2847     sub        ecx, 16
2848     jg         convertloop
2849 
2850     pop        edi
2851     pop        esi
2852     ret
2853   }
2854 }
2855 
// Produces one row of U and one row of V from a single row of packed UYVY
// (no averaging with a second row). The even-indexed source bytes (chroma)
// are kept and the resulting UVUV sequence is split into U and V bytes.
//   src_uyvy     source row, read with movdqa (must be 16-byte aligned).
//   dst_u/dst_v  destinations, written 8 bytes at a time with movq.
//   pix          number of pixels; handled in groups of 16.
// Each iteration reads 32 source bytes and writes 8 bytes of U and 8 bytes of
// V. The loop body runs before the count is tested, so at least one group is
// always processed. edi is saved and restored.
2856 __declspec(naked) __declspec(align(16))
UYVYToUV422Row_SSE2(const uint8 * src_uyvy,uint8 * dst_u,uint8 * dst_v,int pix)2857 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2858                          uint8* dst_u, uint8* dst_v, int pix) {
2859   __asm {
2860     push       edi
2861     mov        eax, [esp + 4 + 4]    // src_uyvy
2862     mov        edx, [esp + 4 + 8]    // dst_u
2863     mov        edi, [esp + 4 + 12]   // dst_v
2864     mov        ecx, [esp + 4 + 16]   // pix
2865     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2866     psrlw      xmm5, 8
2867     sub        edi, edx              // edi = dst_v - dst_u; [edx + edi] is dst_v
2868 
2869     align      16
2870   convertloop:
2871     movdqa     xmm0, [eax]
2872     movdqa     xmm1, [eax + 16]
2873     lea        eax,  [eax + 32]
2874     pand       xmm0, xmm5   // UYVY -> UVUV
2875     pand       xmm1, xmm5
2876     packuswb   xmm0, xmm1
2877     movdqa     xmm1, xmm0
2878     pand       xmm0, xmm5  // U
2879     packuswb   xmm0, xmm0
2880     psrlw      xmm1, 8     // V
2881     packuswb   xmm1, xmm1
2882     movq       qword ptr [edx], xmm0
2883     movq       qword ptr [edx + edi], xmm1
2884     lea        edx, [edx + 8]
2885     sub        ecx, 16
2886     jg         convertloop
2887 
2888     pop        edi
2889     ret
2890   }
2891 }
2892 
// Same operation as UYVYToYRow_SSE2, but the loads and the store use movdqu,
// so neither src_uyvy nor dst_y needs to be 16-byte aligned.
// The odd-indexed source bytes (Y) are kept and stored contiguously in dst_y.
// Each iteration reads 32 source bytes and writes 16 bytes; the loop body runs
// before the count is tested, so at least one 16-pixel group is processed.
2893 __declspec(naked) __declspec(align(16))
UYVYToYRow_Unaligned_SSE2(const uint8 * src_uyvy,uint8 * dst_y,int pix)2894 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2895                                uint8* dst_y, int pix) {
2896   __asm {
2897     mov        eax, [esp + 4]    // src_uyvy
2898     mov        edx, [esp + 8]    // dst_y
2899     mov        ecx, [esp + 12]   // pix
2900 
2901     align      16
2902   convertloop:
2903     movdqu     xmm0, [eax]
2904     movdqu     xmm1, [eax + 16]
2905     lea        eax,  [eax + 32]
2906     psrlw      xmm0, 8    // odd bytes are Y
2907     psrlw      xmm1, 8
2908     packuswb   xmm0, xmm1 // 16 words -> 16 Y bytes
2909     sub        ecx, 16    // sets the flags tested by jg below
2910     movdqu     [edx], xmm0
2911     lea        edx, [edx + 16]
2912     jg         convertloop
2913     ret
2914   }
2915 }
2916 
// Same operation as UYVYToUVRow_SSE2, but both source rows are read with
// movdqu, so they do not need to be 16-byte aligned.
// The row at src_uyvy and the row at src_uyvy + stride_uyvy are averaged
// bytewise, the even-indexed bytes (chroma) are kept and split into U and V.
// Each iteration reads 32 bytes from each row and writes 8 bytes to dst_u and
// 8 bytes to dst_v; at least one 16-pixel group is always processed.
// esi and edi are saved and restored.
2917 __declspec(naked) __declspec(align(16))
UYVYToUVRow_Unaligned_SSE2(const uint8 * src_uyvy,int stride_uyvy,uint8 * dst_u,uint8 * dst_v,int pix)2918 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
2919                                 uint8* dst_u, uint8* dst_v, int pix) {
2920   __asm {
2921     push       esi
2922     push       edi
2923     mov        eax, [esp + 8 + 4]    // src_uyvy
2924     mov        esi, [esp + 8 + 8]    // stride_uyvy
2925     mov        edx, [esp + 8 + 12]   // dst_u
2926     mov        edi, [esp + 8 + 16]   // dst_v
2927     mov        ecx, [esp + 8 + 20]   // pix
2928     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2929     psrlw      xmm5, 8
2930     sub        edi, edx              // edi = dst_v - dst_u; [edx + edi] is dst_v
2931 
2932     align      16
2933   convertloop:
2934     movdqu     xmm0, [eax]
2935     movdqu     xmm1, [eax + 16]
2936     movdqu     xmm2, [eax + esi]
2937     movdqu     xmm3, [eax + esi + 16]
2938     lea        eax,  [eax + 32]
2939     pavgb      xmm0, xmm2   // average the two rows
2940     pavgb      xmm1, xmm3
2941     pand       xmm0, xmm5   // UYVY -> UVUV
2942     pand       xmm1, xmm5
2943     packuswb   xmm0, xmm1
2944     movdqa     xmm1, xmm0
2945     pand       xmm0, xmm5  // U
2946     packuswb   xmm0, xmm0
2947     psrlw      xmm1, 8     // V
2948     packuswb   xmm1, xmm1
2949     movq       qword ptr [edx], xmm0
2950     movq       qword ptr [edx + edi], xmm1
2951     lea        edx, [edx + 8]
2952     sub        ecx, 16
2953     jg         convertloop
2954 
2955     pop        edi
2956     pop        esi
2957     ret
2958   }
2959 }
2960 
// Same operation as UYVYToUV422Row_SSE2, but the source row is read with
// movdqu, so it does not need to be 16-byte aligned.
// The even-indexed source bytes (chroma) of a single row are kept and split
// into separate U and V bytes; no second row is averaged in.
// Each iteration reads 32 source bytes and writes 8 bytes to dst_u and 8 bytes
// to dst_v; at least one 16-pixel group is always processed.
// edi is saved and restored.
2961 __declspec(naked) __declspec(align(16))
UYVYToUV422Row_Unaligned_SSE2(const uint8 * src_uyvy,uint8 * dst_u,uint8 * dst_v,int pix)2962 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
2963                                    uint8* dst_u, uint8* dst_v, int pix) {
2964   __asm {
2965     push       edi
2966     mov        eax, [esp + 4 + 4]    // src_uyvy
2967     mov        edx, [esp + 4 + 8]    // dst_u
2968     mov        edi, [esp + 4 + 12]   // dst_v
2969     mov        ecx, [esp + 4 + 16]   // pix
2970     pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
2971     psrlw      xmm5, 8
2972     sub        edi, edx              // edi = dst_v - dst_u; [edx + edi] is dst_v
2973 
2974     align      16
2975   convertloop:
2976     movdqu     xmm0, [eax]
2977     movdqu     xmm1, [eax + 16]
2978     lea        eax,  [eax + 32]
2979     pand       xmm0, xmm5   // UYVY -> UVUV
2980     pand       xmm1, xmm5
2981     packuswb   xmm0, xmm1
2982     movdqa     xmm1, xmm0
2983     pand       xmm0, xmm5  // U
2984     packuswb   xmm0, xmm0
2985     psrlw      xmm1, 8     // V
2986     packuswb   xmm1, xmm1
2987     movq       qword ptr [edx], xmm0
2988     movq       qword ptr [edx + edi], xmm1
2989     lea        edx, [edx + 8]
2990     sub        ecx, 16
2991     jg         convertloop
2992 
2993     pop        edi
2994     ret
2995   }
2996 }
2997 #endif  // HAS_YUY2TOYROW_SSE2
2998 
2999 #ifdef HAS_ARGBBLENDROW_SSE2
3000 // Blend 4 pixels at a time in the main loop, 1 pixel at a time elsewhere.
// Blends a row of src_argb0 over a row of src_argb1 into dst_argb.
// Per pixel, with a0 the alpha byte of the src_argb0 pixel, the code computes
//   dst = saturating_add(src_argb0 | 0xff000000,
//                        (src_argb1 * (256 - a0)) >> 8)   // per channel
// so the alpha byte of every output pixel is 255.
// Structure:
//   alignloop1    one pixel per iteration until dst_argb is 16-byte aligned
//                 (or the pixels run out).
//   convertloop4  four pixels per iteration; sources are read with movdqu,
//                 the destination is written with movdqa.
//   convertloop1  one pixel per iteration for the remainder.
// ecx holds (remaining pixels - 1) in the one-pixel loops and
// (remaining pixels - 4) in the four-pixel loop, so each loop continues while
// the value stays >= 0. A width of 0 or less returns without writing.
// esi is saved and restored.
3001 __declspec(naked) __declspec(align(16))
ARGBBlendRow_SSE2(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)3002 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
3003                        uint8* dst_argb, int width) {
3004   __asm {
3005     push       esi
3006     mov        eax, [esp + 4 + 4]   // src_argb0
3007     mov        esi, [esp + 4 + 8]   // src_argb1
3008     mov        edx, [esp + 4 + 12]  // dst_argb
3009     mov        ecx, [esp + 4 + 16]  // width
3010     pcmpeqb    xmm7, xmm7       // generate constant 1
3011     psrlw      xmm7, 15
3012     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
3013     psrlw      xmm6, 8
3014     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
3015     psllw      xmm5, 8
3016     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
3017     pslld      xmm4, 24
3018 
3019     sub        ecx, 1
3020     je         convertloop1     // only 1 pixel?
3021     jl         convertloop1b    // width <= 0: nothing to do
3022 
3023     // 1 pixel loop until destination pointer is aligned.
3024   alignloop1:
3025     test       edx, 15          // aligned?
3026     je         alignloop1b
3027     movd       xmm3, [eax]
3028     lea        eax, [eax + 4]
3029     movdqa     xmm0, xmm3       // src argb
3030     pxor       xmm3, xmm4       // ~alpha
3031     movd       xmm2, [esi]      // _r_b
3032     psrlw      xmm3, 8          // alpha
3033     pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
3034     pshuflw    xmm3, xmm3,0F5h
3035     pand       xmm2, xmm6       // _r_b
3036     paddw      xmm3, xmm7       // 256 - alpha
3037     pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
3038     movd       xmm1, [esi]      // _a_g
3039     lea        esi, [esi + 4]
3040     psrlw      xmm1, 8          // _a_g
3041     por        xmm0, xmm4       // set alpha to 255
3042     pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
3043     psrlw      xmm2, 8          // _r_b convert to 8 bits again
3044     paddusb    xmm0, xmm2       // + src argb
3045     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3046     paddusb    xmm0, xmm1       // + src argb
3047     sub        ecx, 1
3048     movd       [edx], xmm0
3049     lea        edx, [edx + 4]
3050     jge        alignloop1
3051 
3052   alignloop1b:
3053     add        ecx, 1 - 4       // ecx = remaining pixels - 4
3054     jl         convertloop4b    // fewer than 4 pixels left
3055 
3056     // 4 pixel loop.
3057   convertloop4:
3058     movdqu     xmm3, [eax]      // src argb
3059     lea        eax, [eax + 16]
3060     movdqa     xmm0, xmm3       // src argb
3061     pxor       xmm3, xmm4       // ~alpha
3062     movdqu     xmm2, [esi]      // _r_b
3063     psrlw      xmm3, 8          // alpha
3064     pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
3065     pshuflw    xmm3, xmm3,0F5h
3066     pand       xmm2, xmm6       // _r_b
3067     paddw      xmm3, xmm7       // 256 - alpha
3068     pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
3069     movdqu     xmm1, [esi]      // _a_g
3070     lea        esi, [esi + 16]
3071     psrlw      xmm1, 8          // _a_g
3072     por        xmm0, xmm4       // set alpha to 255
3073     pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
3074     psrlw      xmm2, 8          // _r_b convert to 8 bits again
3075     paddusb    xmm0, xmm2       // + src argb
3076     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3077     paddusb    xmm0, xmm1       // + src argb
3078     sub        ecx, 4
3079     movdqa     [edx], xmm0
3080     lea        edx, [edx + 16]
3081     jge        convertloop4
3082 
3083   convertloop4b:
3084     add        ecx, 4 - 1       // ecx = remaining pixels - 1
3085     jl         convertloop1b    // no pixels left
3086 
3087     // 1 pixel loop.
3088   convertloop1:
3089     movd       xmm3, [eax]      // src argb
3090     lea        eax, [eax + 4]
3091     movdqa     xmm0, xmm3       // src argb
3092     pxor       xmm3, xmm4       // ~alpha
3093     movd       xmm2, [esi]      // _r_b
3094     psrlw      xmm3, 8          // alpha
3095     pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
3096     pshuflw    xmm3, xmm3,0F5h
3097     pand       xmm2, xmm6       // _r_b
3098     paddw      xmm3, xmm7       // 256 - alpha
3099     pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
3100     movd       xmm1, [esi]      // _a_g
3101     lea        esi, [esi + 4]
3102     psrlw      xmm1, 8          // _a_g
3103     por        xmm0, xmm4       // set alpha to 255
3104     pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
3105     psrlw      xmm2, 8          // _r_b convert to 8 bits again
3106     paddusb    xmm0, xmm2       // + src argb
3107     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3108     paddusb    xmm0, xmm1       // + src argb
3109     sub        ecx, 1
3110     movd       [edx], xmm0
3111     lea        edx, [edx + 4]
3112     jge        convertloop1
3113 
3114   convertloop1b:
3115     pop        esi
3116     ret
3117   }
3118 }
3119 #endif  // HAS_ARGBBLENDROW_SSE2
3120 
3121 #ifdef HAS_ARGBBLENDROW_SSSE3
3122 // Shuffle table for isolating alpha.
// Used as the pshufb control operand: for each of the four pixels in a
// register, byte 3 of that pixel (bytes 3, 7, 11, 15) is copied into the low
// byte of both 16-bit words of the pixel, and the 0x80 entries zero the high
// byte of each word. The result is eight zero-extended alpha words.
3123 static const uvec8 kShuffleAlpha = {
3124   3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3125   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
3126 };
3127 // Same as SSE2, but replaces:
3128 //    psrlw      xmm3, 8          // alpha
3129 //    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
3130 //    pshuflw    xmm3, xmm3,0F5h
3131 // with..
3132 //    pshufb     xmm3, kShuffleAlpha // alpha
3133 // Blend 4 pixels at a time in the main loops, 1 pixel at a time elsewhere.
// Blends a row of src_argb0 over a row of src_argb1 into dst_argb.
// Per pixel, with a0 the alpha byte of the src_argb0 pixel, the code computes
//   dst = saturating_add(src_argb0 | 0xff000000,
//                        (src_argb1 * (256 - a0)) >> 8)   // per channel
// so the alpha byte of every output pixel is 255.
// Structure:
//   alignloop1     one pixel per iteration until dst_argb is 16-byte aligned
//                  (or the pixels run out).
//   convertloop4   four pixels per iteration with movdqa loads; taken only
//                  when both source pointers are 16-byte aligned.
//   convertuloop4  four pixels per iteration with movdqu loads; taken when
//                  either source pointer is unaligned.
//   convertloop1   one pixel per iteration for the remainder.
// Both four-pixel loops write the destination with movdqa.
// A width of 0 or less returns without writing. esi is saved and restored.
// NOTE(review): pshufb with kShuffleAlpha as a memory operand requires the
// table to be 16-byte aligned; presumably uvec8 guarantees that - confirm.
3134 
3135 __declspec(naked) __declspec(align(16))
ARGBBlendRow_SSSE3(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)3136 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
3137                         uint8* dst_argb, int width) {
3138   __asm {
3139     push       esi
3140     mov        eax, [esp + 4 + 4]   // src_argb0
3141     mov        esi, [esp + 4 + 8]   // src_argb1
3142     mov        edx, [esp + 4 + 12]  // dst_argb
3143     mov        ecx, [esp + 4 + 16]  // width
3144     pcmpeqb    xmm7, xmm7       // generate constant 1
3145     psrlw      xmm7, 15
3146     pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
3147     psrlw      xmm6, 8
3148     pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
3149     psllw      xmm5, 8
3150     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
3151     pslld      xmm4, 24
3152 
3153     sub        ecx, 1
3154     je         convertloop1     // only 1 pixel?
3155     jl         convertloop1b    // width <= 0: nothing to do
3156 
3157     // 1 pixel loop until destination pointer is aligned.
3158   alignloop1:
3159     test       edx, 15          // aligned?
3160     je         alignloop1b
3161     movd       xmm3, [eax]
3162     lea        eax, [eax + 4]
3163     movdqa     xmm0, xmm3       // src argb
3164     pxor       xmm3, xmm4       // ~alpha
3165     movd       xmm2, [esi]      // _r_b
3166     pshufb     xmm3, kShuffleAlpha // alpha
3167     pand       xmm2, xmm6       // _r_b
3168     paddw      xmm3, xmm7       // 256 - alpha
3169     pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
3170     movd       xmm1, [esi]      // _a_g
3171     lea        esi, [esi + 4]
3172     psrlw      xmm1, 8          // _a_g
3173     por        xmm0, xmm4       // set alpha to 255
3174     pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
3175     psrlw      xmm2, 8          // _r_b convert to 8 bits again
3176     paddusb    xmm0, xmm2       // + src argb
3177     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3178     paddusb    xmm0, xmm1       // + src argb
3179     sub        ecx, 1
3180     movd       [edx], xmm0
3181     lea        edx, [edx + 4]
3182     jge        alignloop1
3183 
3184   alignloop1b:
3185     add        ecx, 1 - 4       // ecx = remaining pixels - 4
3186     jl         convertloop4b    // fewer than 4 pixels left
3187 
3188     test       eax, 15          // unaligned?
3189     jne        convertuloop4
3190     test       esi, 15          // unaligned?
3191     jne        convertuloop4
3192 
3193     // 4 pixel loop.
3194   convertloop4:
3195     movdqa     xmm3, [eax]      // src argb
3196     lea        eax, [eax + 16]
3197     movdqa     xmm0, xmm3       // src argb
3198     pxor       xmm3, xmm4       // ~alpha
3199     movdqa     xmm2, [esi]      // _r_b
3200     pshufb     xmm3, kShuffleAlpha // alpha
3201     pand       xmm2, xmm6       // _r_b
3202     paddw      xmm3, xmm7       // 256 - alpha
3203     pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
3204     movdqa     xmm1, [esi]      // _a_g
3205     lea        esi, [esi + 16]
3206     psrlw      xmm1, 8          // _a_g
3207     por        xmm0, xmm4       // set alpha to 255
3208     pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
3209     psrlw      xmm2, 8          // _r_b convert to 8 bits again
3210     paddusb    xmm0, xmm2       // + src argb
3211     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3212     paddusb    xmm0, xmm1       // + src argb
3213     sub        ecx, 4
3214     movdqa     [edx], xmm0
3215     lea        edx, [edx + 16]
3216     jge        convertloop4
3217     jmp        convertloop4b    // skip the unaligned loop
3218 
3219     // 4 pixel unaligned loop.
3220   convertuloop4:
3221     movdqu     xmm3, [eax]      // src argb
3222     lea        eax, [eax + 16]
3223     movdqa     xmm0, xmm3       // src argb
3224     pxor       xmm3, xmm4       // ~alpha
3225     movdqu     xmm2, [esi]      // _r_b
3226     pshufb     xmm3, kShuffleAlpha // alpha
3227     pand       xmm2, xmm6       // _r_b
3228     paddw      xmm3, xmm7       // 256 - alpha
3229     pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
3230     movdqu     xmm1, [esi]      // _a_g
3231     lea        esi, [esi + 16]
3232     psrlw      xmm1, 8          // _a_g
3233     por        xmm0, xmm4       // set alpha to 255
3234     pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
3235     psrlw      xmm2, 8          // _r_b convert to 8 bits again
3236     paddusb    xmm0, xmm2       // + src argb
3237     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3238     paddusb    xmm0, xmm1       // + src argb
3239     sub        ecx, 4
3240     movdqa     [edx], xmm0
3241     lea        edx, [edx + 16]
3242     jge        convertuloop4
3243 
3244   convertloop4b:
3245     add        ecx, 4 - 1       // ecx = remaining pixels - 1
3246     jl         convertloop1b    // no pixels left
3247 
3248     // 1 pixel loop.
3249   convertloop1:
3250     movd       xmm3, [eax]      // src argb
3251     lea        eax, [eax + 4]
3252     movdqa     xmm0, xmm3       // src argb
3253     pxor       xmm3, xmm4       // ~alpha
3254     movd       xmm2, [esi]      // _r_b
3255     pshufb     xmm3, kShuffleAlpha // alpha
3256     pand       xmm2, xmm6       // _r_b
3257     paddw      xmm3, xmm7       // 256 - alpha
3258     pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
3259     movd       xmm1, [esi]      // _a_g
3260     lea        esi, [esi + 4]
3261     psrlw      xmm1, 8          // _a_g
3262     por        xmm0, xmm4       // set alpha to 255
3263     pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
3264     psrlw      xmm2, 8          // _r_b convert to 8 bits again
3265     paddusb    xmm0, xmm2       // + src argb
3266     pand       xmm1, xmm5       // a_g_ convert to 8 bits again
3267     paddusb    xmm0, xmm1       // + src argb
3268     sub        ecx, 1
3269     movd       [edx], xmm0
3270     lea        edx, [edx + 4]
3271     jge        convertloop1
3272 
3273   convertloop1b:
3274     pop        esi
3275     ret
3276   }
3277 }
3278 #endif  // HAS_ARGBBLENDROW_SSSE3
3279 
3280 #ifdef HAS_ARGBATTENUATE_SSE2
3281 // Attenuate 4 pixels at a time.
3282 // Aligned to 16 bytes.
// Multiplies the B, G and R bytes of each ARGB pixel by that pixel's alpha
// and keeps the original alpha byte.
// Each byte c is widened to the word c * 0x0101 (punpcklbw/punpckhbw with
// itself), the alpha word is broadcast across the pixel, and the product is
// taken with pmulhuw followed by a shift right by 8.
//   src_argb  source row, read with movdqa (must be 16-byte aligned).
//   dst_argb  destination row, written with movdqa (must be 16-byte aligned).
//   width     number of pixels; handled in groups of 4.
// edx holds dst_argb - src_argb so that [eax + edx] addresses the destination
// while only eax is advanced. The loop body runs before the count is tested,
// so at least one group of 4 pixels is always processed.
3283 __declspec(naked) __declspec(align(16))
ARGBAttenuateRow_SSE2(const uint8 * src_argb,uint8 * dst_argb,int width)3284 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
3285   __asm {
3286     mov        eax, [esp + 4]   // src_argb
3287     mov        edx, [esp + 8]   // dst_argb
3288     mov        ecx, [esp + 12]  // width
3289     sub        edx, eax         // edx = dst_argb - src_argb
3290     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
3291     pslld      xmm4, 24
3292     pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
3293     psrld      xmm5, 8
3294 
3295     align      16
3296  convertloop:
3297     movdqa     xmm0, [eax]      // read 4 pixels
3298     punpcklbw  xmm0, xmm0       // first 2
3299     pshufhw    xmm2, xmm0,0FFh  // 8 alpha words
3300     pshuflw    xmm2, xmm2,0FFh
3301     pmulhuw    xmm0, xmm2       // rgb * a
3302     movdqa     xmm1, [eax]      // read 4 pixels
3303     punpckhbw  xmm1, xmm1       // next 2 pixels
3304     pshufhw    xmm2, xmm1,0FFh  // 8 alpha words
3305     pshuflw    xmm2, xmm2,0FFh
3306     pmulhuw    xmm1, xmm2       // rgb * a
3307     movdqa     xmm2, [eax]      // alphas
3308     psrlw      xmm0, 8
3309     pand       xmm2, xmm4       // isolate original alphas
3310     psrlw      xmm1, 8
3311     packuswb   xmm0, xmm1
3312     pand       xmm0, xmm5       // clear computed alpha bytes
3313     por        xmm0, xmm2       // merge in original alphas
3314     sub        ecx, 4
3315     movdqa     [eax + edx], xmm0
3316     lea        eax, [eax + 16]
3317     jg         convertloop
3318 
3319     ret
3320   }
3321 }
3322 #endif  // HAS_ARGBATTENUATE_SSE2
3323 
3324 #ifdef HAS_ARGBATTENUATEROW_SSSE3
3325 // Shuffle table duplicating alpha.
// pshufb control for the first two pixels of a 4-pixel register: the alpha
// byte of pixel 0 (byte 3) fills the first six output bytes and the alpha
// byte of pixel 1 (byte 7) fills bytes 8..13, giving three alpha words per
// pixel. The 128u entries zero the fourth word of each pixel.
3326 static const uvec8 kShuffleAlpha0 = {
3327   3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
3328 };
// pshufb control for the last two pixels of a 4-pixel register: the alpha
// byte of pixel 2 (byte 11) fills the first six output bytes and the alpha
// byte of pixel 3 (byte 15) fills bytes 8..13. The 128u entries zero the
// fourth word of each pixel.
3329 static const uvec8 kShuffleAlpha1 = {
3330   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3331   15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
3332 };
// Attenuates 4 ARGB pixels per iteration: multiplies the B, G and R bytes of
// each pixel by that pixel's alpha and keeps the original alpha byte.
// pshufb with kShuffleAlpha0 / kShuffleAlpha1 builds the per-channel alpha
// words (with a zero word in the alpha position), the colour bytes are
// widened to c * 0x0101 with punpcklbw/punpckhbw, and the product is taken
// with pmulhuw followed by a shift right by 8. Because the alpha word of the
// multiplier is zero, the packed result has a zero alpha byte and the
// original alpha is simply OR-ed back in.
//   src_argb  source row, read with movdqa (must be 16-byte aligned).
//   dst_argb  destination row, written with movdqa (must be 16-byte aligned).
//   width     number of pixels; handled in groups of 4.
// edx holds dst_argb - src_argb so that [eax + edx] addresses the destination.
// The loop body runs before the count is tested, so at least one group of 4
// pixels is always processed.
3333 __declspec(naked) __declspec(align(16))
ARGBAttenuateRow_SSSE3(const uint8 * src_argb,uint8 * dst_argb,int width)3334 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3335   __asm {
3336     mov        eax, [esp + 4]   // src_argb
3337     mov        edx, [esp + 8]   // dst_argb
3338     mov        ecx, [esp + 12]  // width
3339     sub        edx, eax         // edx = dst_argb - src_argb
3340     pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
3341     pslld      xmm3, 24
3342     movdqa     xmm4, kShuffleAlpha0
3343     movdqa     xmm5, kShuffleAlpha1
3344 
3345     align      16
3346  convertloop:
3347     movdqa     xmm0, [eax]      // read 4 pixels
3348     pshufb     xmm0, xmm4       // isolate first 2 alphas
3349     movdqa     xmm1, [eax]      // read 4 pixels
3350     punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
3351     pmulhuw    xmm0, xmm1       // rgb * a
3352     movdqa     xmm1, [eax]      // read 4 pixels
3353     pshufb     xmm1, xmm5       // isolate next 2 alphas
3354     movdqa     xmm2, [eax]      // read 4 pixels
3355     punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
3356     pmulhuw    xmm1, xmm2       // rgb * a
3357     movdqa     xmm2, [eax]      // mask original alpha
3358     pand       xmm2, xmm3
3359     psrlw      xmm0, 8
3360     psrlw      xmm1, 8
3361     packuswb   xmm0, xmm1
3362     por        xmm0, xmm2       // copy original alpha
3363     sub        ecx, 4
3364     movdqa     [eax + edx], xmm0
3365     lea        eax, [eax + 16]
3366     jg         convertloop
3367 
3368     ret
3369   }
3370 }
3371 #endif  // HAS_ARGBATTENUATEROW_SSSE3
3372 
3373 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
3374 // Unattenuate 4 pixels at a time.
3375 // Aligned to 16 bytes.
// For each pixel the alpha byte indexes the dword table fixed_invtbl8
// (defined elsewhere in the file; presumably a fixed-point reciprocal of
// alpha - confirm against its definition). The low word of the table entry
// is broadcast to the B, G and R positions (pshuflw 0C0h leaves a zero word
// in the alpha position), each colour byte is widened to c * 0x0101, and the
// two are multiplied with pmulhuw. packuswb saturates the results to bytes
// and the original alpha byte is OR-ed back in.
//   src_argb  source row, read with movdqa (must be 16-byte aligned).
//   dst_argb  destination row, written with movdqa (must be 16-byte aligned).
//   width     number of pixels; handled in groups of 4.
// edx holds dst_argb - src_argb so that [eax + edx] addresses the destination.
// The loop body runs before the count is tested, so at least one group of 4
// pixels is always processed. esi and edi are saved and restored.
3376 __declspec(naked) __declspec(align(16))
ARGBUnattenuateRow_SSE2(const uint8 * src_argb,uint8 * dst_argb,int width)3377 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
3378                              int width) {
3379   __asm {
3380     push       esi
3381     push       edi
3382     mov        eax, [esp + 8 + 4]   // src_argb
3383     mov        edx, [esp + 8 + 8]   // dst_argb
3384     mov        ecx, [esp + 8 + 12]  // width
3385     sub        edx, eax             // edx = dst_argb - src_argb
3386     pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
3387     pslld      xmm4, 24
3388 
3389     align      16
3390  convertloop:
3391     movdqa     xmm0, [eax]      // read 4 pixels
3392     movzx      esi, byte ptr [eax + 3]  // first alpha
3393     movzx      edi, byte ptr [eax + 7]  // second alpha
3394     punpcklbw  xmm0, xmm0       // first 2
3395     movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
3396     movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
3397     pshuflw    xmm2, xmm2,0C0h  // first 4 inv_alpha words
3398     pshuflw    xmm3, xmm3,0C0h  // next 4 inv_alpha words
3399     movlhps    xmm2, xmm3
3400     pmulhuw    xmm0, xmm2       // rgb * inv_alpha
3401 
3402     movdqa     xmm1, [eax]      // read 4 pixels
3403     movzx      esi, byte ptr [eax + 11]  // third alpha
3404     movzx      edi, byte ptr [eax + 15]  // fourth alpha
3405     punpckhbw  xmm1, xmm1       // next 2
3406     movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
3407     movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
3408     pshuflw    xmm2, xmm2,0C0h  // first 4 inv_alpha words
3409     pshuflw    xmm3, xmm3,0C0h  // next 4 inv_alpha words
3410     movlhps    xmm2, xmm3
3411     pmulhuw    xmm1, xmm2       // rgb * inv_alpha
3412 
3413     movdqa     xmm2, [eax]      // alphas
3414     pand       xmm2, xmm4
3415     packuswb   xmm0, xmm1
3416     por        xmm0, xmm2
3417     sub        ecx, 4
3418     movdqa     [eax + edx], xmm0
3419     lea        eax, [eax + 16]
3420     jg         convertloop
3421     pop        edi
3422     pop        esi
3423     ret
3424   }
3425 }
3426 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
3427 
3428 #ifdef HAS_ARGBGRAYROW_SSSE3
3429 // Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R
// Weights are in B, G, R, A byte order, repeated for four pixels, and sum to
// 128; the user of this table shifts the weighted sum right by 7.
3430 static const vec8 kARGBToGray = {
3431   14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
3432 };
3433 
3434 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// For each pixel the gray value is (B * 14 + G * 76 + R * 38) >> 7, computed
// with pmaddubsw against kARGBToGray and phaddw. The gray value is written to
// the B, G and R bytes and the original alpha byte is kept.
//   src_argb  source row, read with movdqa (must be 16-byte aligned).
//   dst_argb  destination row, written with movdqa (must be 16-byte aligned).
//   width     number of pixels; handled in groups of 8.
// edx holds dst_argb - src_argb so that [eax + edx] addresses the destination.
// The loop body runs before the count is tested, so at least one group of 8
// pixels is always processed.
3435 __declspec(naked) __declspec(align(16))
ARGBGrayRow_SSSE3(const uint8 * src_argb,uint8 * dst_argb,int width)3436 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3437   __asm {
3438     mov        eax, [esp + 4]   /* src_argb */
3439     mov        edx, [esp + 8]   /* dst_argb */
3440     mov        ecx, [esp + 12]  /* width */
3441     movdqa     xmm4, kARGBToGray
3442     sub        edx, eax         // edx = dst_argb - src_argb
3443 
3444     align      16
3445  convertloop:
3446     movdqa     xmm0, [eax]  // G
3447     movdqa     xmm1, [eax + 16]
3448     pmaddubsw  xmm0, xmm4   // per pixel: B*14 + G*76, R*38 + A*0
3449     pmaddubsw  xmm1, xmm4
3450     phaddw     xmm0, xmm1   // one weighted sum per pixel
3451     psrlw      xmm0, 7
3452     packuswb   xmm0, xmm0   // 8 G bytes
3453     movdqa     xmm2, [eax]  // A
3454     movdqa     xmm3, [eax + 16]
3455     psrld      xmm2, 24
3456     psrld      xmm3, 24
3457     packuswb   xmm2, xmm3
3458     packuswb   xmm2, xmm2   // 8 A bytes
3459     movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
3460     punpcklbw  xmm0, xmm0   // 8 GG words
3461     punpcklbw  xmm3, xmm2   // 8 GA words
3462     movdqa     xmm1, xmm0
3463     punpcklwd  xmm0, xmm3   // GGGA first 4
3464     punpckhwd  xmm1, xmm3   // GGGA next 4
3465     sub        ecx, 8
3466     movdqa     [eax + edx], xmm0
3467     movdqa     [eax + edx + 16], xmm1
3468     lea        eax, [eax + 32]
3469     jg         convertloop
3470     ret
3471   }
3472 }
3473 #endif  // HAS_ARGBGRAYROW_SSSE3
3474 
3475 #ifdef HAS_ARGBSEPIAROW_SSSE3
3476 //    b = (r * 35 + g * 68 + b * 17) >> 7
3477 //    g = (r * 45 + g * 88 + b * 22) >> 7
3478 //    r = (r * 50 + g * 98 + b * 24) >> 7
3479 // Constant for ARGB color to sepia tone.
3480 static const vec8 kARGBToSepiaB = {
3481   17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3482 };
3483 
3484 static const vec8 kARGBToSepiaG = {
3485   22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3486 };
3487 
3488 static const vec8 kARGBToSepiaR = {
3489   24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3490 };
3491 
3492 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
3493 __declspec(naked) __declspec(align(16))
ARGBSepiaRow_SSSE3(uint8 * dst_argb,int width)3494 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3495   __asm {
3496     mov        eax, [esp + 4]   /* dst_argb */
3497     mov        ecx, [esp + 8]   /* width */
3498     movdqa     xmm2, kARGBToSepiaB
3499     movdqa     xmm3, kARGBToSepiaG
3500     movdqa     xmm4, kARGBToSepiaR
3501 
3502     align      16
3503  convertloop:
3504     movdqa     xmm0, [eax]  // B
3505     movdqa     xmm6, [eax + 16]
3506     pmaddubsw  xmm0, xmm2
3507     pmaddubsw  xmm6, xmm2
3508     phaddw     xmm0, xmm6
3509     psrlw      xmm0, 7
3510     packuswb   xmm0, xmm0   // 8 B values
3511     movdqa     xmm5, [eax]  // G
3512     movdqa     xmm1, [eax + 16]
3513     pmaddubsw  xmm5, xmm3
3514     pmaddubsw  xmm1, xmm3
3515     phaddw     xmm5, xmm1
3516     psrlw      xmm5, 7
3517     packuswb   xmm5, xmm5   // 8 G values
3518     punpcklbw  xmm0, xmm5   // 8 BG values
3519     movdqa     xmm5, [eax]  // R
3520     movdqa     xmm1, [eax + 16]
3521     pmaddubsw  xmm5, xmm4
3522     pmaddubsw  xmm1, xmm4
3523     phaddw     xmm5, xmm1
3524     psrlw      xmm5, 7
3525     packuswb   xmm5, xmm5   // 8 R values
3526     movdqa     xmm6, [eax]  // A
3527     movdqa     xmm1, [eax + 16]
3528     psrld      xmm6, 24
3529     psrld      xmm1, 24
3530     packuswb   xmm6, xmm1
3531     packuswb   xmm6, xmm6   // 8 A values
3532     punpcklbw  xmm5, xmm6   // 8 RA values
3533     movdqa     xmm1, xmm0   // Weave BG, RA together
3534     punpcklwd  xmm0, xmm5   // BGRA first 4
3535     punpckhwd  xmm1, xmm5   // BGRA next 4
3536     sub        ecx, 8
3537     movdqa     [eax], xmm0
3538     movdqa     [eax + 16], xmm1
3539     lea        eax, [eax + 32]
3540     jg         convertloop
3541     ret
3542   }
3543 }
3544 #endif  // HAS_ARGBSEPIAROW_SSSE3
3545 
3546 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3547 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
3548 // Same as Sepia except matrix is provided.
3549 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
3550 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
3551 __declspec(naked) __declspec(align(16))
ARGBColorMatrixRow_SSSE3(uint8 * dst_argb,const int8 * matrix_argb,int width)3552 void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
3553                               int width) {
3554   __asm {
3555     mov        eax, [esp + 4]   /* dst_argb */
3556     mov        edx, [esp + 8]   /* matrix_argb */
3557     mov        ecx, [esp + 12]  /* width */
3558     movd       xmm2, [edx]
3559     movd       xmm3, [edx + 4]
3560     movd       xmm4, [edx + 8]
3561     pshufd     xmm2, xmm2, 0
3562     pshufd     xmm3, xmm3, 0
3563     pshufd     xmm4, xmm4, 0
3564 
3565     align      16
3566  convertloop:
3567     movdqa     xmm0, [eax]  // B
3568     movdqa     xmm6, [eax + 16]
3569     pmaddubsw  xmm0, xmm2
3570     pmaddubsw  xmm6, xmm2
3571     movdqa     xmm5, [eax]  // G
3572     movdqa     xmm1, [eax + 16]
3573     pmaddubsw  xmm5, xmm3
3574     pmaddubsw  xmm1, xmm3
3575     phaddsw    xmm0, xmm6   // B
3576     phaddsw    xmm5, xmm1   // G
3577     psraw      xmm0, 7      // B
3578     psraw      xmm5, 7      // G
3579     packuswb   xmm0, xmm0   // 8 B values
3580     packuswb   xmm5, xmm5   // 8 G values
3581     punpcklbw  xmm0, xmm5   // 8 BG values
3582     movdqa     xmm5, [eax]  // R
3583     movdqa     xmm1, [eax + 16]
3584     pmaddubsw  xmm5, xmm4
3585     pmaddubsw  xmm1, xmm4
3586     phaddsw    xmm5, xmm1
3587     psraw      xmm5, 7
3588     packuswb   xmm5, xmm5   // 8 R values
3589     movdqa     xmm6, [eax]  // A
3590     movdqa     xmm1, [eax + 16]
3591     psrld      xmm6, 24
3592     psrld      xmm1, 24
3593     packuswb   xmm6, xmm1
3594     packuswb   xmm6, xmm6   // 8 A values
3595     movdqa     xmm1, xmm0   // Weave BG, RA together
3596     punpcklbw  xmm5, xmm6   // 8 RA values
3597     punpcklwd  xmm0, xmm5   // BGRA first 4
3598     punpckhwd  xmm1, xmm5   // BGRA next 4
3599     sub        ecx, 8
3600     movdqa     [eax], xmm0
3601     movdqa     [eax + 16], xmm1
3602     lea        eax, [eax + 32]
3603     jg         convertloop
3604     ret
3605   }
3606 }
3607 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
3608 
3609 #ifdef HAS_ARGBCOLORTABLEROW_X86
3610 // Tranform ARGB pixels with color table.
3611 __declspec(naked) __declspec(align(16))
ARGBColorTableRow_X86(uint8 * dst_argb,const uint8 * table_argb,int width)3612 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
3613                            int width) {
3614   __asm {
3615     push       ebx
3616     push       esi
3617     push       edi
3618     push       ebp
3619     mov        eax, [esp + 16 + 4]   /* dst_argb */
3620     mov        edi, [esp + 16 + 8]   /* table_argb */
3621     mov        ecx, [esp + 16 + 12]  /* width */
3622     xor        ebx, ebx
3623     xor        edx, edx
3624 
3625     align      16
3626  convertloop:
3627     mov        ebp, dword ptr [eax]  // BGRA
3628     mov        esi, ebp
3629     and        ebp, 255
3630     shr        esi, 8
3631     and        esi, 255
3632     mov        bl, [edi + ebp * 4 + 0]  // B
3633     mov        dl, [edi + esi * 4 + 1]  // G
3634     mov        ebp, dword ptr [eax]  // BGRA
3635     mov        esi, ebp
3636     shr        ebp, 16
3637     shr        esi, 24
3638     and        ebp, 255
3639     mov        [eax], bl
3640     mov        [eax + 1], dl
3641     mov        bl, [edi + ebp * 4 + 2]  // R
3642     mov        dl, [edi + esi * 4 + 3]  // A
3643     mov        [eax + 2], bl
3644     mov        [eax + 3], dl
3645     lea        eax, [eax + 4]
3646     sub        ecx, 1
3647     jg         convertloop
3648     pop        ebp
3649     pop        edi
3650     pop        esi
3651     pop        ebx
3652     ret
3653   }
3654 }
3655 #endif  // HAS_ARGBCOLORTABLEROW_X86
3656 
3657 #ifdef HAS_ARGBQUANTIZEROW_SSE2
3658 // Quantize 4 ARGB pixels (16 bytes).
3659 // Aligned to 16 bytes.
3660 __declspec(naked) __declspec(align(16))
ARGBQuantizeRow_SSE2(uint8 * dst_argb,int scale,int interval_size,int interval_offset,int width)3661 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3662                           int interval_offset, int width) {
3663   __asm {
3664     mov        eax, [esp + 4]    /* dst_argb */
3665     movd       xmm2, [esp + 8]   /* scale */
3666     movd       xmm3, [esp + 12]  /* interval_size */
3667     movd       xmm4, [esp + 16]  /* interval_offset */
3668     mov        ecx, [esp + 20]   /* width */
3669     pshuflw    xmm2, xmm2, 040h
3670     pshufd     xmm2, xmm2, 044h
3671     pshuflw    xmm3, xmm3, 040h
3672     pshufd     xmm3, xmm3, 044h
3673     pshuflw    xmm4, xmm4, 040h
3674     pshufd     xmm4, xmm4, 044h
3675     pxor       xmm5, xmm5  // constant 0
3676     pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
3677     pslld      xmm6, 24
3678 
3679     align      16
3680  convertloop:
3681     movdqa     xmm0, [eax]  // read 4 pixels
3682     punpcklbw  xmm0, xmm5   // first 2 pixels
3683     pmulhuw    xmm0, xmm2   // pixel * scale >> 16
3684     movdqa     xmm1, [eax]  // read 4 pixels
3685     punpckhbw  xmm1, xmm5   // next 2 pixels
3686     pmulhuw    xmm1, xmm2
3687     pmullw     xmm0, xmm3   // * interval_size
3688     movdqa     xmm7, [eax]  // read 4 pixels
3689     pmullw     xmm1, xmm3
3690     pand       xmm7, xmm6   // mask alpha
3691     paddw      xmm0, xmm4   // + interval_size / 2
3692     paddw      xmm1, xmm4
3693     packuswb   xmm0, xmm1
3694     por        xmm0, xmm7
3695     sub        ecx, 4
3696     movdqa     [eax], xmm0
3697     lea        eax, [eax + 16]
3698     jg         convertloop
3699     ret
3700   }
3701 }
3702 #endif  // HAS_ARGBQUANTIZEROW_SSE2
3703 
3704 #ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
3705 // Consider float CumulativeSum.
3706 // Consider calling CumulativeSum one row at time as needed.
3707 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
3708 // Convert cumulative sum for an area to an average for 1 pixel.
3709 // topleft is pointer to top left of CumulativeSum buffer for area.
3710 // botleft is pointer to bottom left of CumulativeSum buffer.
3711 // width is offset from left to right of area in CumulativeSum buffer measured
3712 //   in number of ints.
3713 // area is the number of pixels in the area being averaged.
3714 // dst points to pixel to store result to.
3715 // count is number of averaged pixels to produce.
3716 // Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
3717 // aligned.
CumulativeSumToAverage_SSE2(const int32 * topleft,const int32 * botleft,int width,int area,uint8 * dst,int count)3718 void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
3719                                  int width, int area, uint8* dst, int count) {
3720   __asm {
3721     mov        eax, topleft  // eax topleft
3722     mov        esi, botleft  // esi botleft
3723     mov        edx, width
3724     movd       xmm4, area
3725     mov        edi, dst
3726     mov        ecx, count
3727     cvtdq2ps   xmm4, xmm4
3728     rcpss      xmm4, xmm4  // 1.0f / area
3729     pshufd     xmm4, xmm4, 0
3730     sub        ecx, 4
3731     jl         l4b
3732 
3733     // 4 pixel loop
3734     align      4
3735   l4:
3736     // top left
3737     movdqa     xmm0, [eax]
3738     movdqa     xmm1, [eax + 16]
3739     movdqa     xmm2, [eax + 32]
3740     movdqa     xmm3, [eax + 48]
3741 
3742     // - top right
3743     psubd      xmm0, [eax + edx * 4]
3744     psubd      xmm1, [eax + edx * 4 + 16]
3745     psubd      xmm2, [eax + edx * 4 + 32]
3746     psubd      xmm3, [eax + edx * 4 + 48]
3747     lea        eax, [eax + 64]
3748 
3749     // - bottom left
3750     psubd      xmm0, [esi]
3751     psubd      xmm1, [esi + 16]
3752     psubd      xmm2, [esi + 32]
3753     psubd      xmm3, [esi + 48]
3754 
3755     // + bottom right
3756     paddd      xmm0, [esi + edx * 4]
3757     paddd      xmm1, [esi + edx * 4 + 16]
3758     paddd      xmm2, [esi + edx * 4 + 32]
3759     paddd      xmm3, [esi + edx * 4 + 48]
3760     lea        esi, [esi + 64]
3761 
3762     cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
3763     cvtdq2ps   xmm1, xmm1
3764     mulps      xmm0, xmm4
3765     mulps      xmm1, xmm4
3766     cvtdq2ps   xmm2, xmm2
3767     cvtdq2ps   xmm3, xmm3
3768     mulps      xmm2, xmm4
3769     mulps      xmm3, xmm4
3770     cvtps2dq   xmm0, xmm0
3771     cvtps2dq   xmm1, xmm1
3772     cvtps2dq   xmm2, xmm2
3773     cvtps2dq   xmm3, xmm3
3774     packssdw   xmm0, xmm1
3775     packssdw   xmm2, xmm3
3776     packuswb   xmm0, xmm2
3777     movdqu     [edi], xmm0
3778     lea        edi, [edi + 16]
3779     sub        ecx, 4
3780     jge        l4
3781 
3782   l4b:
3783     add        ecx, 4 - 1
3784     jl         l1b
3785 
3786     // 1 pixel loop
3787     align      4
3788   l1:
3789     movdqa     xmm0, [eax]
3790     psubd      xmm0, [eax + edx * 4]
3791     lea        eax, [eax + 16]
3792     psubd      xmm0, [esi]
3793     paddd      xmm0, [esi + edx * 4]
3794     lea        esi, [esi + 16]
3795     cvtdq2ps   xmm0, xmm0
3796     mulps      xmm0, xmm4
3797     cvtps2dq   xmm0, xmm0
3798     packssdw   xmm0, xmm0
3799     packuswb   xmm0, xmm0
3800     movd       dword ptr [edi], xmm0
3801     lea        edi, [edi + 4]
3802     sub        ecx, 1
3803     jge        l1
3804   l1b:
3805   }
3806 }
3807 #endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
3808 
3809 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
3810 // Creates a table of cumulative sums where each value is a sum of all values
3811 // above and to the left of the value.
ComputeCumulativeSumRow_SSE2(const uint8 * row,int32 * cumsum,const int32 * previous_cumsum,int width)3812 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
3813                                   const int32* previous_cumsum, int width) {
3814   __asm {
3815     mov        eax, row
3816     mov        edx, cumsum
3817     mov        esi, previous_cumsum
3818     mov        ecx, width
3819     sub        esi, edx
3820     pxor       xmm0, xmm0
3821     pxor       xmm1, xmm1
3822 
3823     sub        ecx, 4
3824     jl         l4b
3825     test       edx, 15
3826     jne        l4b
3827 
3828     // 4 pixel loop
3829     align      4
3830   l4:
3831     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
3832     lea        eax, [eax + 16]
3833     movdqa     xmm4, xmm2
3834 
3835     punpcklbw  xmm2, xmm1
3836     movdqa     xmm3, xmm2
3837     punpcklwd  xmm2, xmm1
3838     punpckhwd  xmm3, xmm1
3839 
3840     punpckhbw  xmm4, xmm1
3841     movdqa     xmm5, xmm4
3842     punpcklwd  xmm4, xmm1
3843     punpckhwd  xmm5, xmm1
3844 
3845     paddd      xmm0, xmm2
3846     movdqa     xmm2, [edx + esi]  // previous row above.
3847     paddd      xmm2, xmm0
3848 
3849     paddd      xmm0, xmm3
3850     movdqa     xmm3, [edx + esi + 16]
3851     paddd      xmm3, xmm0
3852 
3853     paddd      xmm0, xmm4
3854     movdqa     xmm4, [edx + esi + 32]
3855     paddd      xmm4, xmm0
3856 
3857     paddd      xmm0, xmm5
3858     movdqa     xmm5, [edx + esi + 48]
3859     paddd      xmm5, xmm0
3860 
3861     movdqa     [edx], xmm2
3862     movdqa     [edx + 16], xmm3
3863     movdqa     [edx + 32], xmm4
3864     movdqa     [edx + 48], xmm5
3865 
3866     lea        edx, [edx + 64]
3867     sub        ecx, 4
3868     jge        l4
3869 
3870   l4b:
3871     add        ecx, 4 - 1
3872     jl         l1b
3873 
3874     // 1 pixel loop
3875     align      4
3876   l1:
3877     movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
3878     lea        eax, [eax + 4]
3879     punpcklbw  xmm2, xmm1
3880     punpcklwd  xmm2, xmm1
3881     paddd      xmm0, xmm2
3882     movdqu     xmm2, [edx + esi]
3883     paddd      xmm2, xmm0
3884     movdqu     [edx], xmm2
3885     lea        edx, [edx + 16]
3886     sub        ecx, 1
3887     jge        l1
3888 
3889  l1b:
3890   }
3891 }
3892 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
3893 
3894 #ifdef HAS_ARGBSHADE_SSE2
3895 // Shade 4 pixels at a time by specified value.
3896 // Aligned to 16 bytes.
3897 __declspec(naked) __declspec(align(16))
ARGBShadeRow_SSE2(const uint8 * src_argb,uint8 * dst_argb,int width,uint32 value)3898 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3899                        uint32 value) {
3900   __asm {
3901     mov        eax, [esp + 4]   // src_argb
3902     mov        edx, [esp + 8]   // dst_argb
3903     mov        ecx, [esp + 12]  // width
3904     movd       xmm2, [esp + 16]  // value
3905     sub        edx, eax
3906     punpcklbw  xmm2, xmm2
3907     punpcklqdq xmm2, xmm2
3908 
3909     align      16
3910  convertloop:
3911     movdqa     xmm0, [eax]      // read 4 pixels
3912     movdqa     xmm1, xmm0
3913     punpcklbw  xmm0, xmm0       // first 2
3914     punpckhbw  xmm1, xmm1       // next 2
3915     pmulhuw    xmm0, xmm2       // argb * value
3916     pmulhuw    xmm1, xmm2       // argb * value
3917     psrlw      xmm0, 8
3918     psrlw      xmm1, 8
3919     packuswb   xmm0, xmm1
3920     sub        ecx, 4
3921     movdqa     [eax + edx], xmm0
3922     lea        eax, [eax + 16]
3923     jg         convertloop
3924 
3925     ret
3926   }
3927 }
3928 #endif  // HAS_ARGBSHADE_SSE2
3929 
3930 #ifdef HAS_ARGBAFFINEROW_SSE2
3931 // Copy ARGB pixels from source image with slope to a row of destination.
3932 __declspec(naked) __declspec(align(16))
3933 LIBYUV_API
ARGBAffineRow_SSE2(const uint8 * src_argb,int src_argb_stride,uint8 * dst_argb,const float * uv_dudv,int width)3934 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
3935                         uint8* dst_argb, const float* uv_dudv, int width) {
3936   __asm {
3937     push       esi
3938     push       edi
3939     mov        eax, [esp + 12]   // src_argb
3940     mov        esi, [esp + 16]  // stride
3941     mov        edx, [esp + 20]  // dst_argb
3942     mov        ecx, [esp + 24]  // pointer to uv_dudv
3943     movq       xmm2, qword ptr [ecx]  // uv
3944     movq       xmm7, qword ptr [ecx + 8]  // dudv
3945     mov        ecx, [esp + 28]  // width
3946     shl        esi, 16          // 4, stride
3947     add        esi, 4
3948     movd       xmm5, esi
3949     sub        ecx, 4
3950     jl         l4b
3951 
3952     // setup for 4 pixel loop
3953     pshufd     xmm7, xmm7, 0x44  // dup dudv
3954     pshufd     xmm5, xmm5, 0  // dup 4, stride
3955     movdqa     xmm0, xmm2    // x0, y0, x1, y1
3956     addps      xmm0, xmm7
3957     movlhps    xmm2, xmm0
3958     movdqa     xmm4, xmm7
3959     addps      xmm4, xmm4    // dudv *= 2
3960     movdqa     xmm3, xmm2    // x2, y2, x3, y3
3961     addps      xmm3, xmm4
3962     addps      xmm4, xmm4    // dudv *= 4
3963 
3964     // 4 pixel loop
3965     align      4
3966   l4:
3967     cvttps2dq  xmm0, xmm2    // x, y float to int first 2
3968     cvttps2dq  xmm1, xmm3    // x, y float to int next 2
3969     packssdw   xmm0, xmm1    // x, y as 8 shorts
3970     pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
3971     movd       esi, xmm0
3972     pshufd     xmm0, xmm0, 0x39  // shift right
3973     movd       edi, xmm0
3974     pshufd     xmm0, xmm0, 0x39  // shift right
3975     movd       xmm1, [eax + esi]  // read pixel 0
3976     movd       xmm6, [eax + edi]  // read pixel 1
3977     punpckldq  xmm1, xmm6     // combine pixel 0 and 1
3978     addps      xmm2, xmm4    // x, y += dx, dy first 2
3979     movq       qword ptr [edx], xmm1
3980     movd       esi, xmm0
3981     pshufd     xmm0, xmm0, 0x39  // shift right
3982     movd       edi, xmm0
3983     movd       xmm6, [eax + esi]  // read pixel 2
3984     movd       xmm0, [eax + edi]  // read pixel 3
3985     punpckldq  xmm6, xmm0     // combine pixel 2 and 3
3986     addps      xmm3, xmm4    // x, y += dx, dy next 2
3987     sub        ecx, 4
3988     movq       qword ptr 8[edx], xmm6
3989     lea        edx, [edx + 16]
3990     jge        l4
3991 
3992   l4b:
3993     add        ecx, 4 - 1
3994     jl         l1b
3995 
3996     // 1 pixel loop
3997     align      4
3998   l1:
3999     cvttps2dq  xmm0, xmm2    // x, y float to int
4000     packssdw   xmm0, xmm0    // x, y as shorts
4001     pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
4002     addps      xmm2, xmm7    // x, y += dx, dy
4003     movd       esi, xmm0
4004     movd       xmm0, [eax + esi]  // copy a pixel
4005     sub        ecx, 1
4006     movd       [edx], xmm0
4007     lea        edx, [edx + 4]
4008     jge        l1
4009   l1b:
4010     pop        edi
4011     pop        esi
4012     ret
4013   }
4014 }
4015 #endif  // HAS_ARGBAFFINEROW_SSE2
4016 
4017 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
4018 __declspec(naked) __declspec(align(16))
ARGBInterpolateRow_SSSE3(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)4019 void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
4020                               ptrdiff_t src_stride, int dst_width,
4021                               int source_y_fraction) {
4022   __asm {
4023     push       esi
4024     push       edi
4025     mov        edi, [esp + 8 + 4]   // dst_ptr
4026     mov        esi, [esp + 8 + 8]   // src_ptr
4027     mov        edx, [esp + 8 + 12]  // src_stride
4028     mov        ecx, [esp + 8 + 16]  // dst_width
4029     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
4030     sub        edi, esi
4031     shr        eax, 1
4032     cmp        eax, 0
4033     je         xloop1
4034     cmp        eax, 64
4035     je         xloop2
4036     movd       xmm0, eax  // high fraction 0..127
4037     neg        eax
4038     add        eax, 128
4039     movd       xmm5, eax  // low fraction 128..1
4040     punpcklbw  xmm5, xmm0
4041     punpcklwd  xmm5, xmm5
4042     pshufd     xmm5, xmm5, 0
4043 
4044     align      16
4045   xloop:
4046     movdqa     xmm0, [esi]
4047     movdqa     xmm2, [esi + edx]
4048     movdqa     xmm1, xmm0
4049     punpcklbw  xmm0, xmm2
4050     punpckhbw  xmm1, xmm2
4051     pmaddubsw  xmm0, xmm5
4052     pmaddubsw  xmm1, xmm5
4053     psrlw      xmm0, 7
4054     psrlw      xmm1, 7
4055     packuswb   xmm0, xmm1
4056     sub        ecx, 4
4057     movdqa     [esi + edi], xmm0
4058     lea        esi, [esi + 16]
4059     jg         xloop
4060 
4061     pop        edi
4062     pop        esi
4063     ret
4064 
4065     align      16
4066   xloop1:
4067     movdqa     xmm0, [esi]
4068     sub        ecx, 4
4069     movdqa     [esi + edi], xmm0
4070     lea        esi, [esi + 16]
4071     jg         xloop1
4072 
4073     pop        edi
4074     pop        esi
4075     ret
4076 
4077     align      16
4078   xloop2:
4079     movdqa     xmm0, [esi]
4080     pavgb      xmm0, [esi + edx]
4081     sub        ecx, 4
4082     movdqa     [esi + edi], xmm0
4083     lea        esi, [esi + 16]
4084     jg         xloop2
4085 
4086     pop        edi
4087     pop        esi
4088     ret
4089   }
4090 }
4091 
4092 #endif  // _M_IX86
4093 
4094 #ifdef __cplusplus
4095 }  // extern "C"
4096 }  // namespace libyuv
4097 #endif
4098