/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64
#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))

// GCC 4.2 on OSX has a link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB
CONST vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

CONST vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

CONST vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

// Constants for BGRA
CONST vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

CONST vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

CONST vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
CONST vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

CONST vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

CONST vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

CONST uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

CONST uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
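
// Reference sketch (not part of the original kernels): with ARGB stored in
// memory as B, G, R, A bytes, the SSSE3 row functions below use kARGBToY and
// kAddY16 to compute BT.601 luma in 7-bit fixed point. The helper name is
// hypothetical and nothing in this file calls it.
static __inline uint8 RGBToYRef(uint8 b, uint8 g, uint8 r) {
  // pmaddubsw + phaddw compute 13 * B + 65 * G + 33 * R per pixel;
  // psrlw $7 scales back and paddb adds the +16 luma offset (kAddY16).
  return static_cast<uint8>(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}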

// Shuffle table for converting RGB24 to ARGB.
CONST uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
CONST uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB.
CONST uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
CONST uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting RGBA to ARGB.
CONST uvec8 kShuffleMaskRGBAToARGB = {
  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
};

// Shuffle table for converting ARGB to RGBA.
CONST uvec8 kShuffleMaskARGBToRGBA = {
  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RGB24.
CONST uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
CONST uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
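
// Reference sketch of pshufb semantics for the tables above (illustrative
// only; the helper name is hypothetical): destination byte i receives source
// byte mask[i] & 15, or zero when the mask byte has its high bit set, which
// is how the 128u entries in the RGB24/RAW output masks clear the tail bytes.
static __inline void PShufBRef(const uint8* src, const uint8* mask,
                               uint8* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0u : src[mask[i] & 15];
  }
}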

void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0x18,%%xmm5                    \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movq      (%0),%%xmm0                     \n"
    "lea       0x8(%0),%0                      \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm1                   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "por       %%xmm5,%%xmm1                   \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "movdqa    %%xmm1,0x10(%1)                 \n"
    "lea       0x20(%1),%1                     \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
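
// Scalar equivalent of the kernel above, as a hedged reference (the helper
// name is hypothetical): each luma byte is replicated into B, G and R, and
// the 0xff000000 mask built in xmm5 forces alpha to 255.
static __inline void I400ToARGBPixelRef(uint8 y, uint8* dst_argb) {
  dst_argb[0] = y;     // B
  dst_argb[1] = y;     // G
  dst_argb[2] = y;     // R
  dst_argb[3] = 255u;  // A
}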

void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    "sub       %0,%1                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskABGRToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    "sub       %0,%1                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskBGRAToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    "sub       %0,%1                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRGBAToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    "sub       %0,%1                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_rgba),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskARGBToRGBA)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm3                 \n"
    "lea       0x30(%0),%0                     \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqa    %%xmm2,0x20(%1)                 \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqa    %%xmm1,0x10(%1)                 \n"
    "por       %%xmm5,%%xmm3                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm3,0x30(%1)                 \n"
    "lea       0x40(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm3                 \n"
    "lea       0x30(%0),%0                     \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqa    %%xmm2,0x20(%1)                 \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqa    %%xmm1,0x10(%1)                 \n"
    "por       %%xmm5,%%xmm3                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm3,0x30(%1)                 \n"
    "lea       0x40(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x20802080,%%eax               \n"
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "psllw     $0xb,%%xmm3                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0xa,%%xmm4                     \n"
    "psrlw     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psllw     $0x8,%%xmm7                     \n"
    "sub       %0,%1                           \n"
    "sub       %0,%1                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm3,%%xmm1                   \n"
    "psllw     $0xb,%%xmm2                     \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm2                   \n"
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"
    "pand      %%xmm4,%%xmm0                   \n"
    "pmulhuw   %%xmm6,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpckhbw %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm1,(%1,%0,2)                \n"
    "movdqa    %%xmm2,0x10(%1,%0,2)            \n"
    "lea       0x10(%0),%0                     \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
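
// How the pmulhuw constants in the kernel above expand narrow fields to
// 8 bits (hedged reference; the helper name is hypothetical): for a 5-bit
// value v shifted to the top of a 16-bit lane,
// (v << 11) * 0x0108 >> 16 == (v << 3) | (v >> 2), and for the 6-bit green
// field, (g << 5) * 0x2080 >> 16 == (g << 2) | (g >> 4).
static __inline void RGB565ToARGBPixelRef(uint16 rgb565, uint8* dst_argb) {
  uint8 b = static_cast<uint8>(rgb565 & 0x1f);
  uint8 g = static_cast<uint8>((rgb565 >> 5) & 0x3f);
  uint8 r = static_cast<uint8>((rgb565 >> 11) & 0x1f);
  dst_argb[0] = static_cast<uint8>((b << 3) | (b >> 2));  // B
  dst_argb[1] = static_cast<uint8>((g << 2) | (g >> 4));  // G
  dst_argb[2] = static_cast<uint8>((r << 3) | (r >> 2));  // R
  dst_argb[3] = 255u;                                     // A
}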

void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x42004200,%%eax               \n"
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "psllw     $0xb,%%xmm3                     \n"
    "movdqa    %%xmm3,%%xmm4                   \n"
    "psrlw     $0x6,%%xmm4                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psllw     $0x8,%%xmm7                     \n"
    "sub       %0,%1                           \n"
    "sub       %0,%1                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psllw     $0x1,%%xmm1                     \n"
    "psllw     $0xb,%%xmm2                     \n"
    "pand      %%xmm3,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm2                   \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"
    "psraw     $0x8,%%xmm2                     \n"
    "pmulhuw   %%xmm6,%%xmm0                   \n"
    "pand      %%xmm7,%%xmm2                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpckhbw %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm1,(%1,%0,2)                \n"
    "movdqa    %%xmm2,0x10(%1,%0,2)            \n"
    "lea       0x10(%0),%0                     \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0xf0f0f0f,%%eax                \n"
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "pslld     $0x4,%%xmm5                     \n"
    "sub       %0,%1                           \n"
    "sub       %0,%1                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "psllw     $0x4,%%xmm1                     \n"
    "psrlw     $0x4,%%xmm3                     \n"
    "por       %%xmm1,%%xmm0                   \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,(%1,%0,2)                \n"
    "movdqa    %%xmm1,0x10(%1,%0,2)            \n"
    "lea       0x10(%0),%0                     \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "lea       0x40(%0),%0                     \n"
    "pshufb    %%xmm6,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"
    "pslldq    $0xc,%%xmm4                     \n"
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqa    %%xmm1,0x10(%1)                 \n"
    "movdqa    %%xmm2,0x20(%1)                 \n"
    "lea       0x30(%1),%1                     \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "lea       0x40(%0),%0                     \n"
    "pshufb    %%xmm6,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"
    "pslldq    $0xc,%%xmm4                     \n"
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqa    %%xmm1,0x10(%1)                 \n"
    "movdqa    %%xmm2,0x20(%1)                 \n"
    "lea       0x30(%1),%1                     \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "psrld     $0x1b,%%xmm3                    \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrld     $0x1a,%%xmm4                    \n"
    "pslld     $0x5,%%xmm4                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0xb,%%xmm5                     \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pslld     $0x8,%%xmm0                     \n"
    "psrld     $0x3,%%xmm1                     \n"
    "psrld     $0x5,%%xmm2                     \n"
    "psrad     $0x10,%%xmm0                    \n"
    "pand      %%xmm3,%%xmm1                   \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm1                   \n"
    "por       %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "lea       0x10(%0),%0                     \n"
    "movq      %%xmm0,(%1)                     \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrld     $0x1b,%%xmm4                    \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "pslld     $0x5,%%xmm5                     \n"
    "movdqa    %%xmm4,%%xmm6                   \n"
    "pslld     $0xa,%%xmm6                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "pslld     $0xf,%%xmm7                     \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "psrad     $0x10,%%xmm0                    \n"
    "psrld     $0x3,%%xmm1                     \n"
    "psrld     $0x6,%%xmm2                     \n"
    "psrld     $0x9,%%xmm3                     \n"
    "pand      %%xmm7,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm6,%%xmm3                   \n"
    "por       %%xmm1,%%xmm0                   \n"
    "por       %%xmm3,%%xmm2                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "lea       0x10(%0),%0                     \n"
    "movq      %%xmm0,(%1)                     \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0xc,%%xmm4                     \n"
    "movdqa    %%xmm4,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm3                     \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm3,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm1                   \n"
    "psrlq     $0x4,%%xmm0                     \n"
    "psrlq     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "lea       0x10(%0),%0                     \n"
    "movq      %%xmm0,(%1)                     \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       0x40(%0),%0                     \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm2                 \n"
    "movdqu    0x30(%0),%%xmm3                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       0x40(%0),%0                     \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

// TODO(fbarchard): Pass the xmm constants to a single block of assembly.
// With fpic, GCC 4.2 for OSX runs out of GPR registers: an "m" operand
// effectively ties up three registers (ebx, ebp and eax), leaving only three
// general registers for the remaining operands, or four if the stack frame
// is disabled. Splitting the work into two assembly blocks is a workaround,
// though it is considered unsafe.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm6                 \n"
    "pavgb     (%0,%4,1),%%xmm0                \n"
    "pavgb     0x10(%0,%4,1),%%xmm1            \n"
    "pavgb     0x20(%0,%4,1),%%xmm2            \n"
    "pavgb     0x30(%0,%4,1),%%xmm6            \n"
    "lea       0x40(%0),%0                     \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0,(%1)                     \n"
    "movhps    %%xmm0,(%1,%2,1)                \n"
    "lea       0x8(%1),%1                      \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
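
// Hedged scalar reference for the UV kernels (the helper name is
// hypothetical): b, g and r are the rounding averages of a 2x2 block of
// pixels, which the kernel forms with pavgb across the two rows and the
// shufps/pavgb pair across columns; the products are 8-bit fixed point.
static __inline void ARGBToUVPixelRef(uint8 b, uint8 g, uint8 r,
                                      uint8* dst_u, uint8* dst_v) {
  *dst_u = static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *dst_v = static_cast<uint8>(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}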

void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kARGBToU),         // %0
    "m"(kARGBToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm2                 \n"
    "movdqu    0x30(%0),%%xmm6                 \n"
    "movdqu    (%0,%4,1),%%xmm7                \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       0x40(%0),%0                     \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0,(%1)                     \n"
    "movhps    %%xmm0,(%1,%2,1)                \n"
    "lea       0x8(%1),%1                      \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       0x40(%0),%0                     \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm2                 \n"
    "movdqu    0x30(%0),%%xmm3                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       0x40(%0),%0                     \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kBGRAToU),         // %0
    "m"(kBGRAToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm6                 \n"
    "pavgb     (%0,%4,1),%%xmm0                \n"
    "pavgb     0x10(%0,%4,1),%%xmm1            \n"
    "pavgb     0x20(%0,%4,1),%%xmm2            \n"
    "pavgb     0x30(%0,%4,1),%%xmm6            \n"
    "lea       0x40(%0),%0                     \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0,(%1)                     \n"
    "movhps    %%xmm0,(%1,%2,1)                \n"
    "lea       0x8(%1),%1                      \n"
    "jg        1b                              \n"
  : "+r"(src_bgra0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kBGRAToU),         // %0
    "m"(kBGRAToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm2                 \n"
    "movdqu    0x30(%0),%%xmm6                 \n"
    "movdqu    (%0,%4,1),%%xmm7                \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       0x40(%0),%0                     \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0,(%1)                     \n"
    "movhps    %%xmm0,(%1,%2,1)                \n"
    "lea       0x8(%1),%1                      \n"
    "jg        1b                              \n"
  : "+r"(src_bgra0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"(static_cast<intptr_t>(src_stride_bgra))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}

void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       0x40(%0),%0                     \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm2                 \n"
    "movdqu    0x30(%0),%%xmm3                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       0x40(%0),%0                     \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
1147 
ABGRToUVRow_SSSE3(const uint8 * src_abgr0,int src_stride_abgr,uint8 * dst_u,uint8 * dst_v,int width)1148 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1149                        uint8* dst_u, uint8* dst_v, int width) {
1150   asm volatile (
1151     "movdqa    %0,%%xmm4                       \n"
1152     "movdqa    %1,%%xmm3                       \n"
1153     "movdqa    %2,%%xmm5                       \n"
1154   :
1155   : "m"(kABGRToU),         // %0
1156     "m"(kABGRToV),         // %1
1157     "m"(kAddUV128)         // %2
1158   );
1159   asm volatile (
1160     "sub       %1,%2                           \n"
1161     ".p2align  4                               \n"
1162   "1:                                          \n"
1163     "movdqa    (%0),%%xmm0                     \n"
1164     "movdqa    0x10(%0),%%xmm1                 \n"
1165     "movdqa    0x20(%0),%%xmm2                 \n"
1166     "movdqa    0x30(%0),%%xmm6                 \n"
1167     "pavgb     (%0,%4,1),%%xmm0                \n"
1168     "pavgb     0x10(%0,%4,1),%%xmm1            \n"
1169     "pavgb     0x20(%0,%4,1),%%xmm2            \n"
1170     "pavgb     0x30(%0,%4,1),%%xmm6            \n"
1171     "lea       0x40(%0),%0                     \n"
1172     "movdqa    %%xmm0,%%xmm7                   \n"
1173     "shufps    $0x88,%%xmm1,%%xmm0             \n"
1174     "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1175     "pavgb     %%xmm7,%%xmm0                   \n"
1176     "movdqa    %%xmm2,%%xmm7                   \n"
1177     "shufps    $0x88,%%xmm6,%%xmm2             \n"
1178     "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1179     "pavgb     %%xmm7,%%xmm2                   \n"
1180     "movdqa    %%xmm0,%%xmm1                   \n"
1181     "movdqa    %%xmm2,%%xmm6                   \n"
1182     "pmaddubsw %%xmm4,%%xmm0                   \n"
1183     "pmaddubsw %%xmm4,%%xmm2                   \n"
1184     "pmaddubsw %%xmm3,%%xmm1                   \n"
1185     "pmaddubsw %%xmm3,%%xmm6                   \n"
1186     "phaddw    %%xmm2,%%xmm0                   \n"
1187     "phaddw    %%xmm6,%%xmm1                   \n"
1188     "psraw     $0x8,%%xmm0                     \n"
1189     "psraw     $0x8,%%xmm1                     \n"
1190     "packsswb  %%xmm1,%%xmm0                   \n"
1191     "paddb     %%xmm5,%%xmm0                   \n"
1192     "sub       $0x10,%3                        \n"
1193     "movlps    %%xmm0,(%1)                     \n"
1194     "movhps    %%xmm0,(%1,%2,1)                \n"
1195     "lea       0x8(%1),%1                      \n"
1196     "jg        1b                              \n"
1197   : "+r"(src_abgr0),       // %0
1198     "+r"(dst_u),           // %1
1199     "+r"(dst_v),           // %2
1200     "+rm"(width)           // %3
1201   : "r"(static_cast<intptr_t>(src_stride_abgr))
1202   : "memory", "cc"
1203 #if defined(__SSE2__)
1204     , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1205 #endif
1206   );
1207 }
1208 
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kABGRToU),         // %0
    "m"(kABGRToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    0x20(%0),%%xmm2                 \n"
    "movdqu    0x30(%0),%%xmm6                 \n"
    "movdqu    (%0,%4,1),%%xmm7                \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       0x40(%0),%0                     \n"
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0,(%1)                     \n"
    "movhps    %%xmm0,(%1,%2,1)                \n"
    "lea       0x8(%1),%1                      \n"
    "jg        1b                              \n"
  : "+r"(src_abgr0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"(static_cast<intptr_t>(src_stride_abgr))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
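
// Hedged scalar model of the ABGRToUVRow loops above (illustrative reference,
// not used by the SIMD paths): each U/V output comes from a 2x2 block of
// ABGR pixels (memory bytes R,G,B,A), averaged, then weighted with the
// kABGRToU/kABGRToV coefficients and biased by 128. pavgb rounds each
// pairwise average, so the SIMD result can differ from this sketch by +/-1.
static void ABGRToUVRow_Reference(const uint8* src_abgr0, int src_stride_abgr,
                                  uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_abgr1 = src_abgr0 + src_stride_abgr;
  for (int x = 0; x < width; x += 2) {
    // Average the 2x2 block per channel (byte 0 = R, 1 = G, 2 = B).
    int r = (src_abgr0[0] + src_abgr0[4] + src_abgr1[0] + src_abgr1[4] + 2) >> 2;
    int g = (src_abgr0[1] + src_abgr0[5] + src_abgr1[1] + src_abgr1[5] + 2) >> 2;
    int b = (src_abgr0[2] + src_abgr0[6] + src_abgr1[2] + src_abgr1[6] + 2) >> 2;
    // Same fixed point weights as kABGRToU/kABGRToV; >> 8 matches psraw $0x8.
    *dst_u++ = static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    *dst_v++ = static_cast<uint8>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
    src_abgr0 += 8;
    src_abgr1 += 8;
  }
}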
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_I422TOARGBROW_SSSE3
#define UB 127 /* 2.018 * 64 = 129; clamped to int8 max 127 */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
struct {
  vec8 kUVToB;  // 0
  vec8 kUVToG;  // 16
  vec8 kUVToR;  // 32
  vec16 kUVBiasB;  // 48
  vec16 kUVBiasG;  // 64
  vec16 kUVBiasR;  // 80
  vec16 kYSub16;  // 96
  vec16 kYToRgb;  // 112
  vec8 kVUToB;  // 128
  vec8 kVUToG;  // 144
  vec8 kVUToR;  // 160
} CONST SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG },
  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
};

// Read 8 UV from 444
#define READYUV444                                                             \
    "movq       (%[u_buf]),%%xmm0              \n"                             \
    "movq       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
    "lea        0x8(%[u_buf]),%[u_buf]         \n"                             \
    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422                                                             \
    "movd       (%[u_buf]),%%xmm0              \n"                             \
    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
    "lea        0x4(%[u_buf]),%[u_buf]         \n"                             \
    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \

// Read 2 UV from 411, upsample to 8 UV
#define READYUV411                                                             \
    "movd       (%[u_buf]),%%xmm0              \n"                             \
    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
    "lea        0x2(%[u_buf]),%[u_buf]         \n"                             \
    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
    "punpckldq  %%xmm0,%%xmm0                  \n"                             \

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12                                                               \
    "movq       (%[uv_buf]),%%xmm0             \n"                             \
    "lea        0x8(%[uv_buf]),%[uv_buf]       \n"                             \
    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \

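// In the subsampled readers above, punpcklwd (plus punpckldq for 411)
// replicates each interleaved UV pair across the pixels that share it. A
// hedged scalar equivalent for 422 is:
//   u_for_pixel[i] = u_buf[i / 2];  v_for_pixel[i] = v_buf[i / 2];
// and for 411 the divisor is 4. NV12 reads U and V pre-interleaved from a
// single plane, so only the widening duplication is needed.
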
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                  \n"                             \
    "movdqa     %%xmm0,%%xmm2                  \n"                             \
    "pmaddubsw  (%[kYuvConstants]),%%xmm0      \n"                             \
    "pmaddubsw  16(%[kYuvConstants]),%%xmm1    \n"                             \
    "pmaddubsw  32(%[kYuvConstants]),%%xmm2    \n"                             \
    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
    "movq       (%[y_buf]),%%xmm3              \n"                             \
    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
    "paddsw     %%xmm3,%%xmm0                  \n"                             \
    "paddsw     %%xmm3,%%xmm1                  \n"                             \
    "paddsw     %%xmm3,%%xmm2                  \n"                             \
    "psraw      $0x6,%%xmm0                    \n"                             \
    "psraw      $0x6,%%xmm1                    \n"                             \
    "psraw      $0x6,%%xmm2                    \n"                             \
    "packuswb   %%xmm0,%%xmm0                  \n"                             \
    "packuswb   %%xmm1,%%xmm1                  \n"                             \
    "packuswb   %%xmm2,%%xmm2                  \n"                             \

// Convert 8 pixels: 8 VU and 8 Y
#define YVUTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                  \n"                             \
    "movdqa     %%xmm0,%%xmm2                  \n"                             \
    "pmaddubsw  128(%[kYuvConstants]),%%xmm0   \n"                             \
    "pmaddubsw  144(%[kYuvConstants]),%%xmm1   \n"                             \
    "pmaddubsw  160(%[kYuvConstants]),%%xmm2   \n"                             \
    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
    "movq       (%[y_buf]),%%xmm3              \n"                             \
    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
    "paddsw     %%xmm3,%%xmm0                  \n"                             \
    "paddsw     %%xmm3,%%xmm1                  \n"                             \
    "paddsw     %%xmm3,%%xmm2                  \n"                             \
    "psraw      $0x6,%%xmm0                    \n"                             \
    "psraw      $0x6,%%xmm1                    \n"                             \
    "psraw      $0x6,%%xmm2                    \n"                             \
    "packuswb   %%xmm0,%%xmm0                  \n"                             \
    "packuswb   %%xmm1,%%xmm1                  \n"                             \
    "packuswb   %%xmm2,%%xmm2                  \n"                             \

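// Hedged scalar model of YUVTORGB above (illustrative only, not used by the
// SIMD paths): 6 bit fixed point BT.601 math using the UB..YG constants;
// psraw $0x6 is the >> 6, and packuswb supplies the 0..255 clamp. The sketch
// ignores the intermediate word saturation of psubsw/paddsw.
static inline uint8 Clamp0To255(int v) {
  return static_cast<uint8>(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static inline void YuvPixel_Reference(uint8 y, uint8 u, uint8 v,
                                      uint8* b, uint8* g, uint8* r) {
  int y1 = (static_cast<int>(y) - 16) * YG;  // (y - 16) * 1.164 in fixed point
  *b = Clamp0To255((y1 + UB * (u - 128)) >> 6);
  *g = Clamp0To255((y1 + UG * (u - 128) + VG * (v - 128)) >> 6);
  *r = Clamp0To255((y1 + VR * (v - 128)) >> 6);
}
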
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* argb_buf,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READYUV444
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,(%[argb_buf])            \n"
    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* argb_buf,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,(%[argb_buf])            \n"
    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

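// Hedged scalar model of the ARGB weave shared by the I4xxToARGB rows
// (illustrative only): the punpcklbw/punpcklwd ladder interleaves the packed
// B, G, R results with the all-ones alpha, storing bytes B,G,R,A per pixel.
static void I422ToARGBRow_Reference(const uint8* y_buf, const uint8* u_buf,
                                    const uint8* v_buf, uint8* argb_buf,
                                    int width) {
  for (int x = 0; x < width; ++x) {
    YuvPixel_Reference(y_buf[x], u_buf[x / 2], v_buf[x / 2],
                       argb_buf + 0, argb_buf + 1, argb_buf + 2);
    argb_buf[3] = 255;  // alpha lane from pcmpeqb %%xmm5,%%xmm5
    argb_buf += 4;
  }
}
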
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* argb_buf,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READYUV411
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,(%[argb_buf])            \n"
    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* argb_buf,
                                int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READNV12
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,(%[argb_buf])            \n"
    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* vu_buf,
                                uint8* argb_buf,
                                int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READNV12
    YVUTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,(%[argb_buf])            \n"
    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(vu_buf),    // %[uv_buf]
    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

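// NV21 reuses READNV12 unchanged; only the constants differ. The
// kVUToB/kVUToG/kVUToR rows of kYuvConstants (offsets 128..160, used by
// YVUTORGB) carry the same coefficients with U and V swapped to match the
// VU byte order of the interleaved plane.
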
void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* argb_buf,
                                          int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READYUV444
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0,(%[argb_buf])            \n"
    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* argb_buf,
                                          int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0,(%[argb_buf])            \n"
    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* argb_buf,
                                          int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READYUV411
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0,(%[argb_buf])            \n"
    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* uv_buf,
                                          uint8* argb_buf,
                                          int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READNV12
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0,(%[argb_buf])            \n"
    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* vu_buf,
                                          uint8* argb_buf,
                                          int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READNV12
    YVUTORGB
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0,(%[argb_buf])            \n"
    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(vu_buf),    // %[uv_buf]
    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

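// The BGRA and ABGR variants below reuse the same YUVTORGB math and differ
// only in the weave order: BGRA stores bytes A,R,G,B and ABGR stores bytes
// R,G,B,A. The BGRA path re-creates the alpha mask inside the loop because
// its first punpcklbw consumes %%xmm5.
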
void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* bgra_buf,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm5                   \n"
    "movdqa    %%xmm5,%%xmm0                   \n"
    "punpcklwd %%xmm1,%%xmm5                   \n"
    "punpckhwd %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm5,(%[argb_buf])            \n"
    "movdqa    %%xmm0,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [argb_buf]"+r"(bgra_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* abgr_buf,
                                int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm2                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,(%[argb_buf])            \n"
    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [argb_buf]"+r"(abgr_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* bgra_buf,
                                          int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm5                   \n"
    "movdqa    %%xmm5,%%xmm0                   \n"
    "punpcklwd %%xmm1,%%xmm5                   \n"
    "punpckhwd %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm5,(%[argb_buf])            \n"
    "movdqu    %%xmm0,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [argb_buf]"+r"(bgra_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* abgr_buf,
                                          int width) {
  asm volatile (
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm2                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm2,(%[argb_buf])            \n"
    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [argb_buf]"+r"(abgr_buf),  // %[argb_buf]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_I422TOARGBROW_SSSE3

#ifdef HAS_YTOARGBROW_SSE2
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "mov       $0x10001000,%%eax               \n"
    "movd      %%eax,%%xmm3                    \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "mov       $0x012a012a,%%eax               \n"
    "movd      %%eax,%%xmm2                    \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq      (%0),%%xmm0                     \n"
    "lea       0x8(%0),%0                      \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "psubusw   %%xmm3,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"

    // Step 2: Weave into ARGB
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm1                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "por       %%xmm4,%%xmm1                   \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "movdqa    %%xmm1,16(%1)                   \n"
    "lea       32(%1),%1                       \n"

    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+rm"(width)    // %2
  :
  : "memory", "cc", "eax"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
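
// Hedged scalar model of the fixed point luma scale above (illustrative
// only): punpcklbw with itself makes each word y * 257, psubusw subtracts
// 0x1000 with unsigned saturation, and pmulhuw by 0x012a keeps the high 16
// bits, approximating (y - 16) * 1.164; packuswb supplies the 255 clamp.
static inline uint8 YToGray_Reference(uint8 y) {
  unsigned int w = y * 257u;                       // punpcklbw %%xmm0,%%xmm0
  w = w > 0x1000u ? w - 0x1000u : 0u;              // psubusw
  unsigned int g = (w * 0x012au) >> 16;            // pmulhuw
  return static_cast<uint8>(g > 255u ? 255u : g);  // packuswb clamp
}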
#endif  // HAS_YTOARGBROW_SSE2

#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
CONST uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    "lea       -0x10(%0),%0                    \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0,%2),%%xmm0                  \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
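
// Scalar equivalent of the pshufb mirror above (hedged sketch, for
// reference only):
//   for (int x = 0; x < width; ++x) dst[x] = src[width - 1 - x];
// Each iteration loads the 16 bytes ending at src + width, reverses them
// with kShuffleMirror, and walks the source offset backwards.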
#endif  // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_SSE2
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "lea       -0x10(%0),%0                    \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0,%2),%%xmm0                  \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "psllw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
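
// Without pshufb, the SSE2 path reverses in three stages: psllw/psrlw/por
// swap the two bytes inside each 16 bit lane, pshuflw/pshufhw with $0x1b
// reverse the four words in each 64 bit half, and pshufd $0x4e swaps the
// two halves.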
#endif  // HAS_MIRRORROW_SSE2

#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
CONST uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa    %4,%%xmm1                       \n"
    "lea       -16(%0,%3,2),%0                 \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "lea       -16(%0),%0                      \n"
    "pshufb    %%xmm1,%%xmm0                   \n"
    "sub       $8,%3                           \n"
    "movlpd    %%xmm0,(%1)                     \n"
    "movhpd    %%xmm0,(%1,%2)                  \n"
    "lea       8(%1),%1                        \n"
    "jg        1b                              \n"
  : "+r"(src),      // %0
    "+r"(dst_u),    // %1
    "+r"(dst_v),    // %2
    "+r"(temp_width)  // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
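
// kShuffleMirrorUV deinterleaves and reverses in one pshufb: the low eight
// result bytes collect the mirrored U samples (even offsets, descending)
// and the high eight collect the mirrored V samples (odd offsets), which
// movlpd and movhpd then store to the two planes.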
#endif  // HAS_MIRRORROW_UV_SSSE3

#ifdef HAS_ARGBMIRRORROW_SSSE3
// Shuffle table for reversing ARGB pixels (4 bytes at a time).
CONST uvec8 kARGBShuffleMirror = {
  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};

void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    "lea       -0x10(%0),%0                    \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0,%2,4),%%xmm0                \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kARGBShuffleMirror)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBMIRRORROW_SSSE3

#ifdef HAS_SPLITUV_SSE2
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb    %%xmm5,%%xmm5                    \n"
    "psrlw      $0x8,%%xmm5                      \n"
    "sub        %1,%2                            \n"
    ".p2align  4                               \n"
  "1:                                            \n"
    "movdqa     (%0),%%xmm0                      \n"
    "movdqa     0x10(%0),%%xmm1                  \n"
    "lea        0x20(%0),%0                      \n"
    "movdqa     %%xmm0,%%xmm2                    \n"
    "movdqa     %%xmm1,%%xmm3                    \n"
    "pand       %%xmm5,%%xmm0                    \n"
    "pand       %%xmm5,%%xmm1                    \n"
    "packuswb   %%xmm1,%%xmm0                    \n"
    "psrlw      $0x8,%%xmm2                      \n"
    "psrlw      $0x8,%%xmm3                      \n"
    "packuswb   %%xmm3,%%xmm2                    \n"
    "movdqa     %%xmm0,(%1)                      \n"
    "movdqa     %%xmm2,(%1,%2)                   \n"
    "lea        0x10(%1),%1                      \n"
    "sub        $0x10,%3                         \n"
    "jg         1b                               \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(pix)         // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
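
// Hedged scalar equivalent of SplitUV_SSE2 (illustrative reference only):
static void SplitUV_Reference(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                              int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[2 * x + 0];  // pand %%xmm5 keeps the even bytes (U).
    dst_v[x] = src_uv[2 * x + 1];  // psrlw $0x8 keeps the odd bytes (V).
  }
}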
#endif  // HAS_SPLITUV_SSE2

#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    "sub        %0,%1                          \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    %%xmm0,(%0,%1)                  \n"
    "movdqa    %%xmm1,0x10(%0,%1)              \n"
    "lea       0x20(%0),%0                     \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif  // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_X86
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = static_cast<size_t>(width);
  asm volatile (
    "shr       $0x2,%2                         \n"
    "rep movsl                                 \n"
  : "+S"(src),  // %0
    "+D"(dst),  // %1
    "+c"(width_tmp) // %2
  :
  : "memory", "cc"
  );
}
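
// CopyRow_X86 shifts the byte count right by two and copies that many
// dwords with rep movsl, so it assumes width is a multiple of 4; the scalar
// equivalent is simply memcpy(dst, src, width).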
#endif  // HAS_COPYROW_X86

#ifdef HAS_SETROW_X86
void SetRow8_X86(uint8* dst, uint32 v32, int width) {
  size_t width_tmp = static_cast<size_t>(width);
  asm volatile (
    "shr       $0x2,%1                         \n"
    "rep stosl                                 \n"
    : "+D"(dst),       // %0
      "+c"(width_tmp)  // %1
    : "a"(v32)         // %2
    : "memory", "cc");
}

void SetRows32_X86(uint8* dst, uint32 v32, int width,
                   int dst_stride, int height) {
  for (int y = 0; y < height; ++y) {
    size_t width_tmp = static_cast<size_t>(width);
    uint32* d = reinterpret_cast<uint32*>(dst);
    asm volatile (
      "rep stosl                               \n"
      : "+D"(d),         // %0
        "+c"(width_tmp)  // %1
      : "a"(v32)         // %2
      : "memory", "cc");
    dst += dst_stride;
  }
}
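
// Usage sketch (hedged; the argb/argb_stride names are only illustrative):
// fill a 64x4 pixel region of an ARGB surface with opaque black. Note that
// SetRows32_X86 takes the width in 32 bit pixels, while SetRow8_X86 takes
// it in bytes and rounds down to a multiple of 4 via the shr.
//   SetRows32_X86(argb, 0xff000000u, 64, argb_stride, 4);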
#endif  // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_SSE2
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    (%0,%4,1),%%xmm2                \n"
    "movdqa    0x10(%0,%4,1),%%xmm3            \n"
    "lea       0x20(%0),%0                     \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0,(%1)                     \n"
    "movq      %%xmm1,(%1,%2)                  \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0,(%1)                     \n"
    "movq      %%xmm1,(%1,%2)                  \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
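
// Hedged scalar model of the YUY2 readers (memory bytes Y0,U,Y1,V): luma is
// the even bytes, chroma the odd bytes shared by two pixels. YUY2ToUVRow
// additionally averages with the next row via pavgb; this sketch models the
// single row UV422 variant only.
static void YUY2ToUV422Row_Reference(const uint8* src_yuy2,
                                     uint8* dst_u, uint8* dst_v, int pix) {
  for (int x = 0; x < pix; x += 2) {
    *dst_u++ = src_yuy2[1];
    *dst_v++ = src_yuy2[3];
    src_yuy2 += 4;
  }
}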

void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
                                int stride_yuy2,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    (%0,%4,1),%%xmm2                \n"
    "movdqu    0x10(%0,%4,1),%%xmm3            \n"
    "lea       0x20(%0),%0                     \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0,(%1)                     \n"
    "movq      %%xmm1,(%1,%2)                  \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0,(%1)                     \n"
    "movq      %%xmm1,(%1,%2)                  \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
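
// UYVY is the byte swapped cousin of YUY2 (memory bytes U,Y0,V,Y1): luma
// sits in the odd bytes, extracted with psrlw $0x8 above, and chroma in the
// even bytes, extracted with pand in the UV rows below.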

void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    (%0,%4,1),%%xmm2                \n"
    "movdqa    0x10(%0,%4,1),%%xmm3            \n"
    "lea       0x20(%0),%0                     \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0,(%1)                     \n"
    "movq      %%xmm1,(%1,%2)                  \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
2447 
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0,(%1)                     \n"
    "movq      %%xmm1,(%1,%2)                  \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  asm volatile (
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "movdqu    (%0,%4,1),%%xmm2                \n"
    "movdqu    0x10(%0,%4,1),%%xmm3            \n"
    "lea       0x20(%0),%0                     \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0,(%1)                     \n"
    "movq      %%xmm1,(%1,%2)                  \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm0                     \n"
    "movdqu    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0,(%1)                     \n"
    "movq      %%xmm1,(%1,%2)                  \n"
    "lea       0x8(%1),%1                      \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
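
// Hedged reference sketches (one reading of the UYVY kernels above; these
// helpers and their names are illustrative, not part of the original API).
// UYVY packs pixels as U0 Y0 V0 Y1, so Y lives in the odd bytes and U/V in
// the even bytes; the UV variant averages two rows, as pavgb does above.
static void UYVYToYRow_C_Sketch(const uint8* src_uyvy, uint8* dst_y, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_y[x] = src_uyvy[x * 2 + 1];  // odd bytes: psrlw $0x8 + packuswb
  }
}

static void UYVYToUVRow_C_Sketch(const uint8* src_uyvy, int stride_uyvy,
                                 uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* next = src_uyvy + stride_uyvy;
  for (int x = 0; x < pix; x += 2) {
    *dst_u++ = static_cast<uint8>((src_uyvy[0] + next[0] + 1) >> 1);  // pavgb
    *dst_v++ = static_cast<uint8>((src_uyvy[2] + next[2] + 1) >> 1);
    src_uyvy += 4;
    next += 4;
  }
}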
#endif  // HAS_YUY2TOYROW_SSE2

#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 4 pixels at a time.
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x1,%3                         \n"
    "je        91f                             \n"
    "jl        99f                             \n"

    // 1 pixel loop until destination pointer is aligned.
  "10:                                         \n"
    "test      $0xf,%2                         \n"
    "je        19f                             \n"
    "movd      (%0),%%xmm3                     \n"
    "lea       0x4(%0),%0                      \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      (%1),%%xmm2                     \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      (%1),%%xmm1                     \n"
    "lea       0x4(%1),%1                      \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jge       10b                             \n"

  "19:                                         \n"
    "add       $1-4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "41:                                         \n"
    "movdqu    (%0),%%xmm3                     \n"
    "lea       0x10(%0),%0                     \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    (%1),%%xmm2                     \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    (%1),%%xmm1                     \n"
    "lea       0x10(%1),%1                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "jge       41b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        99f                             \n"

    // 1 pixel loop.
  "91:                                         \n"
    "movd      (%0),%%xmm3                     \n"
    "lea       0x4(%0),%0                      \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      (%1),%%xmm2                     \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      (%1),%%xmm1                     \n"
    "lea       0x4(%1),%1                      \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
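
// Hedged reference sketch (one reading of the kernel above; the helper and
// its name are illustrative, not original libyuv code). Per color channel it
// computes dst = src0 + src1 * (256 - src0_alpha) / 256, saturated, and
// forces destination alpha to 255. The SSSE3 version below computes the
// same blend with a pshufb in place of the three-shuffle alpha broadcast.
static void ARGBBlendRow_C_Sketch(const uint8* src_argb0,
                                  const uint8* src_argb1,
                                  uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const uint32 fa = src_argb0[3];  // foreground alpha
    for (int c = 0; c < 3; ++c) {
      const uint32 v = src_argb0[c] + ((src_argb1[c] * (256 - fa)) >> 8);
      dst_argb[c] = static_cast<uint8>(v > 255 ? 255 : v);  // paddusb
    }
    dst_argb[3] = 255u;
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}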
#endif  // HAS_ARGBBLENDROW_SSE2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
CONST uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};

// Blend 4 pixels at a time.

// Same as SSE2, but replaces
//    psrlw      xmm3, 8          // alpha
//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
//    pshuflw    xmm3, xmm3,0F5h
// with:
//    pshufb     xmm3, kShuffleAlpha // alpha
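
// Hedged illustration (assumption, not original libyuv code): scalar
// equivalent of that pshufb. kShuffleAlpha yields eight 16-bit lanes, two
// per pixel, each holding the pixel's alpha byte (already inverted at that
// point in the kernel) zero-extended - exactly what the
// psrlw/pshufhw/pshuflw sequence produced.
static void DuplicateAlphaWords_C_Sketch(const uint8 argb[16],
                                         uint16 alpha_words[8]) {
  for (int i = 0; i < 4; ++i) {
    alpha_words[2 * i + 0] = argb[4 * i + 3];
    alpha_words[2 * i + 1] = argb[4 * i + 3];
  }
}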

void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x1,%3                         \n"
    "je        91f                             \n"
    "jl        99f                             \n"

    // 1 pixel loop until destination pointer is aligned.
  "10:                                         \n"
    "test      $0xf,%2                         \n"
    "je        19f                             \n"
    "movd      (%0),%%xmm3                     \n"
    "lea       0x4(%0),%0                      \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      (%1),%%xmm2                     \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      (%1),%%xmm1                     \n"
    "lea       0x4(%1),%1                      \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jge       10b                             \n"

  "19:                                         \n"
    "add       $1-4,%3                         \n"
    "jl        49f                             \n"
    "test      $0xf,%0                         \n"
    "jne       41f                             \n"
    "test      $0xf,%1                         \n"
    "jne       41f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqa    (%0),%%xmm3                     \n"
    "lea       0x10(%0),%0                     \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqa    (%1),%%xmm2                     \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqa    (%1),%%xmm1                     \n"
    "lea       0x10(%1),%1                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "jge       40b                             \n"
    "jmp       49f                             \n"

    // 4 pixel unaligned loop.
    ".p2align  2                               \n"
  "41:                                         \n"
    "movdqu    (%0),%%xmm3                     \n"
    "lea       0x10(%0),%0                     \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    (%1),%%xmm2                     \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    (%1),%%xmm1                     \n"
    "lea       0x10(%1),%1                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "jge       41b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        99f                             \n"

    // 1 pixel loop.
  "91:                                         \n"
    "movd      (%0),%%xmm3                     \n"
    "lea       0x4(%0),%0                      \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      (%1),%%xmm2                     \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      (%1),%%xmm1                     \n"
    "lea       0x4(%1),%1                      \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW_SSSE3

#ifdef HAS_ARGBATTENUATE_SSE2
// Attenuate 4 pixels at a time.
// Aligned to 16 bytes.
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x8,%%xmm5                     \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
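
// Hedged reference sketch (one reading of the attenuate kernels; the helper
// and its name are illustrative, not original libyuv code). Each color
// channel is multiplied by the pixel's alpha; the SIMD paths above and below
// compute an 8.8 fixed-point approximation of src * alpha / 255.
static void ARGBAttenuateRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  for (int i = 0; i < width; ++i) {
    const uint32 a = src_argb[3];
    dst_argb[0] = static_cast<uint8>(src_argb[0] * a / 255);
    dst_argb[1] = static_cast<uint8>(src_argb[1] * a / 255);
    dst_argb[2] = static_cast<uint8>(src_argb[2] * a / 255);
    dst_argb[3] = src_argb[3];  // alpha is passed through
    src_argb += 4;
    dst_argb += 4;
  }
}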
#endif  // HAS_ARGBATTENUATE_SSE2

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
CONST uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
CONST uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Attenuate 4 pixels at a time.
// Aligned to 16 bytes.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "pslld     $0x18,%%xmm3                    \n"
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm1,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "punpckhbw %%xmm2,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "pand      %%xmm3,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// Aligned to 16 bytes.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha = 0;
  asm volatile (
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movzb     0x3(%0),%3                      \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movd      0x0(%4,%3,4),%%xmm2             \n"
    "movzb     0x7(%0),%3                      \n"
    "movd      0x0(%4,%3,4),%%xmm3             \n"
    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "movzb     0xb(%0),%3                      \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "movd      0x0(%4,%3,4),%%xmm2             \n"
    "movzb     0xf(%0),%3                      \n"
    "movd      0x0(%4,%3,4),%%xmm3             \n"
    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width),       // %2
    "+r"(alpha)        // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
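
// Hedged reference sketch (one reading of the kernel above; the helper and
// its name are illustrative, not original libyuv code). Each color channel
// is divided by the pixel's alpha; the SIMD path replaces the divide with a
// pmulhuw against the fixed_invtbl8 fixed-point reciprocal table.
static void ARGBUnattenuateRow_C_Sketch(const uint8* src_argb,
                                        uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const uint32 a = src_argb[3];
    for (int c = 0; c < 3; ++c) {
      const uint32 v = a ? (src_argb[c] * 255 / a) : src_argb[c];
      dst_argb[c] = static_cast<uint8>(v > 255 ? 255 : v);
    }
    dst_argb[3] = src_argb[3];  // alpha is passed through
    src_argb += 4;
    dst_argb += 4;
  }
}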
#endif  // HAS_ARGBUNATTENUATEROW_SSE2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
CONST vec8 kARGBToGray = {
  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "sub       %0,%1                           \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "movdqa    0x10(%0),%%xmm3                 \n"
    "psrld     $0x18,%%xmm2                    \n"
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm3                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm1                   \n"
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "movdqa    %%xmm1,0x10(%0,%1,1)            \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToGray)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
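
// Hedged reference sketch (one reading of the kernel above; the helper and
// its name are illustrative, not original libyuv code). The kARGBToGray
// weights are the gray coefficients scaled by 128, hence the >> 7.
static void ARGBGrayRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                 int width) {
  for (int i = 0; i < width; ++i) {
    const uint8 y = static_cast<uint8>(
        (src_argb[0] * 14 + src_argb[1] * 76 + src_argb[2] * 38) >> 7);
    dst_argb[0] = dst_argb[1] = dst_argb[2] = y;
    dst_argb[3] = src_argb[3];  // alpha is passed through
    src_argb += 4;
    dst_argb += 4;
  }
}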
#endif  // HAS_ARGBGRAYROW_SSSE3

#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constants for ARGB color to sepia tone.
CONST vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

CONST vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

CONST vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"
    "movdqa    %3,%%xmm3                       \n"
    "movdqa    %4,%%xmm4                       \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm6                 \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "movdqa    (%0),%%xmm6                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
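
// Hedged reference sketch (one reading of the kernel above; the helper and
// its name are illustrative, not original libyuv code). Applies the three
// 128-scaled sepia rows in place, saturating like packuswb does.
static void ARGBSepiaRow_C_Sketch(uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
    const int sb = (b * 17 + g * 68 + r * 35) >> 7;
    const int sg = (b * 22 + g * 88 + r * 45) >> 7;
    const int sr = (b * 24 + g * 98 + r * 50) >> 7;
    dst_argb[0] = static_cast<uint8>(sb > 255 ? 255 : sb);
    dst_argb[1] = static_cast<uint8>(sg > 255 ? 255 : sg);
    dst_argb[2] = static_cast<uint8>(sr > 255 ? 255 : sr);
    dst_argb += 4;  // alpha is left unchanged
  }
}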
#endif  // HAS_ARGBSEPIAROW_SSSE3

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
                              int width) {
  asm volatile (
    "movd      (%2),%%xmm2                     \n"
    "movd      0x4(%2),%%xmm3                  \n"
    "movd      0x8(%2),%%xmm4                  \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"

    // 8 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm6                 \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm6,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm5                   \n"
    "psraw     $0x7,%%xmm0                     \n"
    "psraw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddsw   %%xmm1,%%xmm5                   \n"
    "psraw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "movdqa    (%0),%%xmm6                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "r"(matrix_argb)     // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
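
// Hedged reference sketch (one reading of the kernel above; the helper and
// its name are illustrative, not original libyuv code). matrix_argb holds
// three rows of four signed 1.7 fixed-point coefficients, one row per output
// B, G and R channel; each output is a dot product with (B, G, R, A) >> 7.
static void ARGBColorMatrixRow_C_Sketch(uint8* dst_argb,
                                        const int8* matrix_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2],
              a = dst_argb[3];
    for (int c = 0; c < 3; ++c) {
      const int v = (b * matrix_argb[c * 4 + 0] + g * matrix_argb[c * 4 + 1] +
                     r * matrix_argb[c * 4 + 2] + a * matrix_argb[c * 4 + 3])
                    >> 7;
      dst_argb[c] = static_cast<uint8>(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    dst_argb += 4;  // alpha is left unchanged
  }
}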
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// Aligned to 16 bytes.
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    "pxor      %%xmm5,%%xmm5                   \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"
    "movdqa    (%0),%%xmm7                     \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"
    "paddw     %%xmm4,%%xmm0                   \n"
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"
    "sub       $0x4,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
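
// Hedged reference sketch (one reading of the kernel above; the helper and
// its name are illustrative, not original libyuv code). Each color channel
// is snapped to a quantization interval: scale is a 16.16 fixed-point
// multiplier, so v * scale >> 16 picks the interval index.
static void ARGBQuantizeRow_C_Sketch(uint8* dst_argb, int scale,
                                     int interval_size, int interval_offset,
                                     int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 3; ++c) {  // alpha is left unchanged
      dst_argb[c] = static_cast<uint8>(
          (dst_argb[c] * scale >> 16) * interval_size + interval_offset);
    }
    dst_argb += 4;
  }
}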
#endif  // HAS_ARGBQUANTIZEROW_SSE2

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    "sub       %1,%2                           \n"
    "pxor      %%xmm0,%%xmm0                   \n"
    "pxor      %%xmm1,%%xmm1                   \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    "test      $0xf,%1                         \n"
    "jne       49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqu    (%0),%%xmm2                     \n"
    "lea       0x10(%0),%0                     \n"
    "movdqa    %%xmm2,%%xmm4                   \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "punpckhwd %%xmm1,%%xmm3                   \n"
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"
    "punpckhwd %%xmm1,%%xmm5                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqa    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqa    0x10(%1,%2,1),%%xmm3            \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqa    0x20(%1,%2,1),%%xmm4            \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqa    0x30(%1,%2,1),%%xmm5            \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqa    %%xmm2,(%1)                     \n"
    "movdqa    %%xmm3,0x10(%1)                 \n"
    "movdqa    %%xmm4,0x20(%1)                 \n"
    "movdqa    %%xmm5,0x30(%1)                 \n"
    "lea       0x40(%1),%1                     \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  2                               \n"
  "10:                                         \n"
    "movd      (%0),%%xmm2                     \n"
    "lea       0x4(%0),%0                      \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    (%1,%2,1),%%xmm2                \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "movdqu    %%xmm2,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

  "19:                                         \n"
  : "+r"(row),              // %0
    "+r"(cumsum),           // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)             // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
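
// Hedged reference sketch (one reading of the kernel above; the helper and
// its name are illustrative, not original libyuv code). Each output entry is
// the running sum of this row plus the entry above it, giving an integral
// image with four int32 sums (one per ARGB channel) per pixel.
static void ComputeCumulativeSumRow_C_Sketch(const uint8* row, int32* cumsum,
                                             const int32* previous_cumsum,
                                             int width) {
  int32 sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}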
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2

#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
                                 int width, int area, uint8* dst, int count) {
  asm volatile (
    "movd      %5,%%xmm4                       \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    "rcpss     %%xmm4,%%xmm4                   \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "40:                                         \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    0x20(%0),%%xmm2                 \n"
    "movdqa    0x30(%0),%%xmm3                 \n"
    "psubd     (%0,%4,4),%%xmm0                \n"
    "psubd     0x10(%0,%4,4),%%xmm1            \n"
    "psubd     0x20(%0,%4,4),%%xmm2            \n"
    "psubd     0x30(%0,%4,4),%%xmm3            \n"
    "lea       0x40(%0),%0                     \n"
    "psubd     (%1),%%xmm0                     \n"
    "psubd     0x10(%1),%%xmm1                 \n"
    "psubd     0x20(%1),%%xmm2                 \n"
    "psubd     0x30(%1),%%xmm3                 \n"
    "paddd     (%1,%4,4),%%xmm0                \n"
    "paddd     0x10(%1,%4,4),%%xmm1            \n"
    "paddd     0x20(%1,%4,4),%%xmm2            \n"
    "paddd     0x30(%1,%4,4),%%xmm3            \n"
    "lea       0x40(%1),%1                     \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm1                   \n"
    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
    "mulps     %%xmm4,%%xmm2                   \n"
    "mulps     %%xmm4,%%xmm3                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "cvtps2dq  %%xmm1,%%xmm1                   \n"
    "cvtps2dq  %%xmm2,%%xmm2                   \n"
    "cvtps2dq  %%xmm3,%%xmm3                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  2                               \n"
  "10:                                         \n"
    "movdqa    (%0),%%xmm0                     \n"
    "psubd     (%0,%4,4),%%xmm0                \n"
    "lea       0x10(%0),%0                     \n"
    "psubd     (%1),%%xmm0                     \n"
    "paddd     (%1,%4,4),%%xmm0                \n"
    "lea       0x10(%1),%1                     \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"(static_cast<intptr_t>(width)),  // %4
    "rm"(area)     // %5
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
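
// Hedged reference sketch (one reading of the kernel above; the helper and
// its name are illustrative, not original libyuv code). The box sum comes
// from the integral image as TL - TR - BL + BR; the SIMD path multiplies by
// an rcpss reciprocal approximation of 1/area, so results can differ from
// this exact divide by a small rounding amount.
static void CumulativeSumToAverage_C_Sketch(const int32* topleft,
                                            const int32* botleft, int width,
                                            int area, uint8* dst, int count) {
  const float ooa = 1.0f / area;
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      dst[c] = static_cast<uint8>(
          (topleft[c] - topleft[width * 4 + c] -
           botleft[c] + botleft[width * 4 + c]) * ooa);
    }
    dst += 4;
    topleft += 4;
    botleft += 4;
  }
}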
#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2

#ifdef HAS_ARGBSHADE_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "movd      %3,%%xmm2                       \n"
    "sub       %0,%1                           \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    ".p2align  2                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(dst_argb),       // %1
    "+r"(width)           // %2
  : "r"(value)            // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
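
// Hedged reference sketch (one reading of the kernel above; the helper and
// its name are illustrative, not original libyuv code). Every byte of the
// pixel, alpha included, is scaled by the matching byte of 'value' using the
// same 8.8 fixed-point product the SIMD path computes.
static void ARGBShadeRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  int width, uint32 value) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      const uint32 v = (value >> (c * 8)) & 0xff;
      // Matches punpcklbw + pmulhuw + psrlw: (s * 0x101 * v * 0x101) >> 24.
      dst_argb[c] = static_cast<uint8>(
          ((src_argb[c] * 0x101u) * (v * 0x101u)) >> 24);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}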
#endif  // HAS_ARGBSHADE_SSE2

#ifdef HAS_ARGBAFFINEROW_SSE2
// TODO(fbarchard): Find 64 bit way to avoid masking.
// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
// Copy ARGB pixels from source image with slope to a row of destination.
// Caveat: in 64 bit, movd is used with a 64 bit gpr ("movd %%xmm0,%1")
// because Mac gcc produces an error if movq is used.

LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp = 0;
  asm volatile (
    "movq      (%3),%%xmm2                     \n"
    "movq      0x8(%3),%%xmm7                  \n"
    "shl       $0x10,%1                        \n"
    "add       $0x4,%1                         \n"
    "movd      %1,%%xmm5                       \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm0                   \n"
    "movlhps   %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm7,%%xmm4                   \n"
    "addps     %%xmm4,%%xmm4                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm4                   \n"

    // 4 pixel loop.
    ".p2align  4                               \n"
  "40:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "cvttps2dq %%xmm3,%%xmm1                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
#if defined(__x86_64__)
    "movd      %%xmm0,%1                       \n"
    "mov       %1,%5                           \n"
    "and       $0x0fffffff,%1                  \n"
    "shr       $32,%5                          \n"
    "pshufd    $0xEE,%%xmm0,%%xmm0             \n"
#else
    "movd      %%xmm0,%1                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%5                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
#endif
    "movd      (%0,%1,1),%%xmm1                \n"
    "movd      (%0,%5,1),%%xmm6                \n"
    "punpckldq %%xmm6,%%xmm1                   \n"
    "addps     %%xmm4,%%xmm2                   \n"
    "movq      %%xmm1,(%2)                     \n"
#if defined(__x86_64__)
    "movd      %%xmm0,%1                       \n"
    "mov       %1,%5                           \n"
    "and       $0x0fffffff,%1                  \n"
    "shr       $32,%5                          \n"
#else
    "movd      %%xmm0,%1                       \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%5                       \n"
#endif
    "movd      (%0,%1,1),%%xmm0                \n"
    "movd      (%0,%5,1),%%xmm6                \n"
    "punpckldq %%xmm6,%%xmm0                   \n"
    "addps     %%xmm4,%%xmm3                   \n"
    "sub       $0x4,%4                         \n"
    "movq      %%xmm0,0x08(%2)                 \n"
    "lea       0x10(%2),%2                     \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%4                         \n"
    "jl        19f                             \n"

    // 1 pixel loop.
    ".p2align  4                               \n"
  "10:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm2                   \n"
    "movd      %%xmm0,%1                       \n"
#if defined(__x86_64__)
    "and       $0x0fffffff,%1                  \n"
#endif
    "movd      (%0,%1,1),%%xmm0                \n"
    "sub       $0x1,%4                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),  // %2
    "+r"(uv_dudv),   // %3
    "+rm"(width),    // %4
    "+r"(temp)       // %5
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
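
// Hedged reference sketch (one reading of the kernel above; the helper and
// its name are illustrative, not original libyuv code). uv_dudv holds the
// starting source coordinate (u, v) and its per-pixel step (du, dv); each
// destination pixel copies the source pixel at the truncated coordinate,
// which the SIMD path turns into a byte offset via pmaddwd with (4, stride).
static void ARGBAffineRow_C_Sketch(const uint8* src_argb, int src_argb_stride,
                                   uint8* dst_argb, const float* uv_dudv,
                                   int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    const int x = static_cast<int>(u);  // cvttps2dq truncates
    const int y = static_cast<int>(v);
    *reinterpret_cast<uint32*>(dst_argb + i * 4) =
        *reinterpret_cast<const uint32*>(src_argb + y * src_argb_stride +
                                         x * 4);
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}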
#endif  // HAS_ARGBAFFINEROW_SSE2

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"
    "shr       %3                              \n"
    "cmp       $0x0,%3                         \n"
    "je        2f                              \n"
    "cmp       $0x40,%3                        \n"
    "je        3f                              \n"
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%1),%%xmm0                     \n"
    "movdqa    (%1,%4,1),%%xmm2                \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "pmaddubsw %%xmm5,%%xmm0                   \n"
    "pmaddubsw %%xmm5,%%xmm1                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
    "jmp       4f                              \n"
    ".p2align  4                               \n"
  "2:                                          \n"
    "movdqa    (%1),%%xmm0                     \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        2b                              \n"
    "jmp       4f                              \n"
    ".p2align  4                               \n"
  "3:                                          \n"
    "movdqa    (%1),%%xmm0                     \n"
    "pavgb     (%1,%4,1),%%xmm0                \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        3b                              \n"
  "4:                                          \n"
    ".p2align  4                               \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(source_y_fraction)  // %3
  : "r"(static_cast<intptr_t>(src_stride))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm5"
#endif
  );
}
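
// Hedged reference sketch (one reading of the kernel above; the helper and
// its name are illustrative, not original libyuv code). The fraction is
// halved into a 0..128 range and each byte is blended as a 1.7 fixed-point
// lerp; the kernel special-cases fraction 0 (copy) and 64 (pavgb average).
static void ARGBInterpolateRow_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                        ptrdiff_t src_stride, int dst_width,
                                        int source_y_fraction) {
  const int y1_fraction = source_y_fraction >> 1;
  const int y0_fraction = 128 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  for (int i = 0; i < dst_width * 4; ++i) {
    dst_ptr[i] = static_cast<uint8>(
        (src_ptr[i] * y0_fraction + src_ptr1[i] * y1_fraction) >> 7);
  }
}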

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif