1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17 
18 // This module is for GCC x86 and x64.
19 #if !defined(LIBYUV_DISABLE_X86) && \
20     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
21 
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
// One signed byte weight per source byte, repeated for the four pixels held
// in a 16-byte vector; the fourth weight of every pixel is 0.
// NOTE(review): the values look like luma weights in 7 bit fixed point,
// ordered B, G, R, A -- confirm against the row functions that use them.
// The table is only read, so it is const.
static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                              13, 65, 33, 0, 13, 65, 33, 0};

// JPeg full range.
static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                               15, 75, 38, 0, 15, 75, 38, 0};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
32 
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

// Per-byte signed weights, repeated for the four pixels held in a 16-byte
// vector. Each format (ARGB, BGRA, ABGR, RGBA) has the same weights permuted
// to match its byte order, with a 0 weight in the remaining position.
// All tables here are only read, so they are const.
// NOTE(review): U/V presumably denote the two chroma outputs and the "J"
// variants a full range coefficient set -- confirm against the users.
static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {
    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                              0, 33, 65, 13, 0, 33, 65, 13};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                              33, 65, 13, 0, 33, 65, 13, 0};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                              0, 13, 65, 33, 0, 13, 65, 33};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

// 16 in every byte.
static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

// 128 in every byte.
static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// 0x8080 in every 16-bit lane.
static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
90 
#ifdef HAS_RGB24TOARGBROW_SSSE3

// Byte shuffle control tables. Entry i names the source byte that lands in
// destination byte i; an entry of 128u (bit 7 set) makes pshufb write zero.
// All tables are only read, so they are const.

// Shuffle table for converting RGB24 to ARGB.
// Every fourth byte is taken from source bytes 12..15 as a placeholder.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
// Same as above with the three bytes of each pixel reversed.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24.  First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
// Drops every fourth byte; the top 4 destination bytes are zeroed.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
// As above with the three kept bytes of each pixel reversed.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3
154 
#ifdef HAS_J400TOARGBROW_SSE2
// Expands 8 bytes of src_y per loop iteration into 8 four-byte pixels in
// dst_argb: each source byte is copied into the three low bytes of its pixel
// and the top byte is forced to 0xff.
//   src_y    - input bytes, 8 read per iteration (unaligned access).
//   dst_argb - output, 32 bytes written per iteration (unaligned access).
//   width    - pixel count, decremented by 8 per iteration.
// The loop body runs before the count is tested, so at least one group of 8
// is always processed and a width that is not a multiple of 8 is rounded up.
// NOTE(review): presumably callers deal with remainders -- confirm.
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  asm volatile (
    // xmm5 = 0xff000000 in every 32-bit lane.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0x18,%%xmm5                    \n"
    LABELALIGN
    "1:                                        \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    // Byte -> word -> dword replication: every source byte appears 4 times.
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm1                   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "por       %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_J400TOARGBROW_SSE2
182 
183 #ifdef HAS_RGB24TOARGBROW_SSSE3
RGB24ToARGBRow_SSSE3(const uint8 * src_rgb24,uint8 * dst_argb,int width)184 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
185   asm volatile (
186     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
187     "pslld     $0x18,%%xmm5                    \n"
188     "movdqa    %3,%%xmm4                       \n"
189     LABELALIGN
190     "1:                                        \n"
191     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
192     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
193     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
194     "lea       " MEMLEA(0x30,0) ",%0           \n"
195     "movdqa    %%xmm3,%%xmm2                   \n"
196     "palignr   $0x8,%%xmm1,%%xmm2              \n"
197     "pshufb    %%xmm4,%%xmm2                   \n"
198     "por       %%xmm5,%%xmm2                   \n"
199     "palignr   $0xc,%%xmm0,%%xmm1              \n"
200     "pshufb    %%xmm4,%%xmm0                   \n"
201     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
202     "por       %%xmm5,%%xmm0                   \n"
203     "pshufb    %%xmm4,%%xmm1                   \n"
204     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
205     "por       %%xmm5,%%xmm1                   \n"
206     "palignr   $0x4,%%xmm3,%%xmm3              \n"
207     "pshufb    %%xmm4,%%xmm3                   \n"
208     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
209     "por       %%xmm5,%%xmm3                   \n"
210     "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
211     "lea       " MEMLEA(0x40,1) ",%1           \n"
212     "sub       $0x10,%2                        \n"
213     "jg        1b                              \n"
214   : "+r"(src_rgb24),  // %0
215     "+r"(dst_argb),  // %1
216     "+r"(width)        // %2
217   : "m"(kShuffleMaskRGB24ToARGB)  // %3
218   : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
219   );
220 }
221 
RAWToARGBRow_SSSE3(const uint8 * src_raw,uint8 * dst_argb,int width)222 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
223   asm volatile (
224     "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
225     "pslld     $0x18,%%xmm5                    \n"
226     "movdqa    %3,%%xmm4                       \n"
227     LABELALIGN
228     "1:                                        \n"
229     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
230     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
231     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
232     "lea       " MEMLEA(0x30,0) ",%0           \n"
233     "movdqa    %%xmm3,%%xmm2                   \n"
234     "palignr   $0x8,%%xmm1,%%xmm2              \n"
235     "pshufb    %%xmm4,%%xmm2                   \n"
236     "por       %%xmm5,%%xmm2                   \n"
237     "palignr   $0xc,%%xmm0,%%xmm1              \n"
238     "pshufb    %%xmm4,%%xmm0                   \n"
239     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
240     "por       %%xmm5,%%xmm0                   \n"
241     "pshufb    %%xmm4,%%xmm1                   \n"
242     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
243     "por       %%xmm5,%%xmm1                   \n"
244     "palignr   $0x4,%%xmm3,%%xmm3              \n"
245     "pshufb    %%xmm4,%%xmm3                   \n"
246     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
247     "por       %%xmm5,%%xmm3                   \n"
248     "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
249     "lea       " MEMLEA(0x40,1) ",%1           \n"
250     "sub       $0x10,%2                        \n"
251     "jg        1b                              \n"
252   : "+r"(src_raw),   // %0
253     "+r"(dst_argb),  // %1
254     "+r"(width)        // %2
255   : "m"(kShuffleMaskRAWToARGB)  // %3
256   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
257   );
258 }
259 
RAWToRGB24Row_SSSE3(const uint8 * src_raw,uint8 * dst_rgb24,int width)260 void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
261   asm volatile (
262    "movdqa     %3,%%xmm3                       \n"
263    "movdqa     %4,%%xmm4                       \n"
264    "movdqa     %5,%%xmm5                       \n"
265     LABELALIGN
266     "1:                                        \n"
267     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
268     "movdqu    " MEMACCESS2(0x4,0) ",%%xmm1    \n"
269     "movdqu    " MEMACCESS2(0x8,0) ",%%xmm2    \n"
270     "lea       " MEMLEA(0x18,0) ",%0           \n"
271     "pshufb    %%xmm3,%%xmm0                   \n"
272     "pshufb    %%xmm4,%%xmm1                   \n"
273     "pshufb    %%xmm5,%%xmm2                   \n"
274     "movq      %%xmm0," MEMACCESS(1) "         \n"
275     "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
276     "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
277     "lea       " MEMLEA(0x18,1) ",%1           \n"
278     "sub       $0x8,%2                         \n"
279     "jg        1b                              \n"
280   : "+r"(src_raw),    // %0
281     "+r"(dst_rgb24),  // %1
282     "+r"(width)       // %2
283   : "m"(kShuffleMaskRAWToRGB24_0),  // %3
284     "m"(kShuffleMaskRAWToRGB24_1),  // %4
285     "m"(kShuffleMaskRAWToRGB24_2)   // %5
286   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
287   );
288 }
289 
RGB565ToARGBRow_SSE2(const uint8 * src,uint8 * dst,int width)290 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
291   asm volatile (
292     "mov       $0x1080108,%%eax                \n"
293     "movd      %%eax,%%xmm5                    \n"
294     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
295     "mov       $0x20802080,%%eax               \n"
296     "movd      %%eax,%%xmm6                    \n"
297     "pshufd    $0x0,%%xmm6,%%xmm6              \n"
298     "pcmpeqb   %%xmm3,%%xmm3                   \n"
299     "psllw     $0xb,%%xmm3                     \n"
300     "pcmpeqb   %%xmm4,%%xmm4                   \n"
301     "psllw     $0xa,%%xmm4                     \n"
302     "psrlw     $0x5,%%xmm4                     \n"
303     "pcmpeqb   %%xmm7,%%xmm7                   \n"
304     "psllw     $0x8,%%xmm7                     \n"
305     "sub       %0,%1                           \n"
306     "sub       %0,%1                           \n"
307     LABELALIGN
308     "1:                                        \n"
309     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
310     "movdqa    %%xmm0,%%xmm1                   \n"
311     "movdqa    %%xmm0,%%xmm2                   \n"
312     "pand      %%xmm3,%%xmm1                   \n"
313     "psllw     $0xb,%%xmm2                     \n"
314     "pmulhuw   %%xmm5,%%xmm1                   \n"
315     "pmulhuw   %%xmm5,%%xmm2                   \n"
316     "psllw     $0x8,%%xmm1                     \n"
317     "por       %%xmm2,%%xmm1                   \n"
318     "pand      %%xmm4,%%xmm0                   \n"
319     "pmulhuw   %%xmm6,%%xmm0                   \n"
320     "por       %%xmm7,%%xmm0                   \n"
321     "movdqa    %%xmm1,%%xmm2                   \n"
322     "punpcklbw %%xmm0,%%xmm1                   \n"
323     "punpckhbw %%xmm0,%%xmm2                   \n"
324     MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
325     MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
326     "lea       " MEMLEA(0x10,0) ",%0           \n"
327     "sub       $0x8,%2                         \n"
328     "jg        1b                              \n"
329   : "+r"(src),  // %0
330     "+r"(dst),  // %1
331     "+r"(width)   // %2
332   :
333   : "memory", "cc", "eax", NACL_R14
334     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
335   );
336 }
337 
ARGB1555ToARGBRow_SSE2(const uint8 * src,uint8 * dst,int width)338 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
339   asm volatile (
340     "mov       $0x1080108,%%eax                \n"
341     "movd      %%eax,%%xmm5                    \n"
342     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
343     "mov       $0x42004200,%%eax               \n"
344     "movd      %%eax,%%xmm6                    \n"
345     "pshufd    $0x0,%%xmm6,%%xmm6              \n"
346     "pcmpeqb   %%xmm3,%%xmm3                   \n"
347     "psllw     $0xb,%%xmm3                     \n"
348     "movdqa    %%xmm3,%%xmm4                   \n"
349     "psrlw     $0x6,%%xmm4                     \n"
350     "pcmpeqb   %%xmm7,%%xmm7                   \n"
351     "psllw     $0x8,%%xmm7                     \n"
352     "sub       %0,%1                           \n"
353     "sub       %0,%1                           \n"
354     LABELALIGN
355     "1:                                        \n"
356     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
357     "movdqa    %%xmm0,%%xmm1                   \n"
358     "movdqa    %%xmm0,%%xmm2                   \n"
359     "psllw     $0x1,%%xmm1                     \n"
360     "psllw     $0xb,%%xmm2                     \n"
361     "pand      %%xmm3,%%xmm1                   \n"
362     "pmulhuw   %%xmm5,%%xmm2                   \n"
363     "pmulhuw   %%xmm5,%%xmm1                   \n"
364     "psllw     $0x8,%%xmm1                     \n"
365     "por       %%xmm2,%%xmm1                   \n"
366     "movdqa    %%xmm0,%%xmm2                   \n"
367     "pand      %%xmm4,%%xmm0                   \n"
368     "psraw     $0x8,%%xmm2                     \n"
369     "pmulhuw   %%xmm6,%%xmm0                   \n"
370     "pand      %%xmm7,%%xmm2                   \n"
371     "por       %%xmm2,%%xmm0                   \n"
372     "movdqa    %%xmm1,%%xmm2                   \n"
373     "punpcklbw %%xmm0,%%xmm1                   \n"
374     "punpckhbw %%xmm0,%%xmm2                   \n"
375     MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
376     MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
377     "lea       " MEMLEA(0x10,0) ",%0           \n"
378     "sub       $0x8,%2                         \n"
379     "jg        1b                              \n"
380   : "+r"(src),  // %0
381     "+r"(dst),  // %1
382     "+r"(width)   // %2
383   :
384   : "memory", "cc", "eax", NACL_R14
385     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
386   );
387 }
388 
ARGB4444ToARGBRow_SSE2(const uint8 * src,uint8 * dst,int width)389 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
390   asm volatile (
391     "mov       $0xf0f0f0f,%%eax                \n"
392     "movd      %%eax,%%xmm4                    \n"
393     "pshufd    $0x0,%%xmm4,%%xmm4              \n"
394     "movdqa    %%xmm4,%%xmm5                   \n"
395     "pslld     $0x4,%%xmm5                     \n"
396     "sub       %0,%1                           \n"
397     "sub       %0,%1                           \n"
398     LABELALIGN
399     "1:                                        \n"
400     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
401     "movdqa    %%xmm0,%%xmm2                   \n"
402     "pand      %%xmm4,%%xmm0                   \n"
403     "pand      %%xmm5,%%xmm2                   \n"
404     "movdqa    %%xmm0,%%xmm1                   \n"
405     "movdqa    %%xmm2,%%xmm3                   \n"
406     "psllw     $0x4,%%xmm1                     \n"
407     "psrlw     $0x4,%%xmm3                     \n"
408     "por       %%xmm1,%%xmm0                   \n"
409     "por       %%xmm3,%%xmm2                   \n"
410     "movdqa    %%xmm0,%%xmm1                   \n"
411     "punpcklbw %%xmm2,%%xmm0                   \n"
412     "punpckhbw %%xmm2,%%xmm1                   \n"
413     MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
414     MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
415     "lea       " MEMLEA(0x10,0) ",%0           \n"
416     "sub       $0x8,%2                         \n"
417     "jg        1b                              \n"
418   : "+r"(src),  // %0
419     "+r"(dst),  // %1
420     "+r"(width)   // %2
421   :
422   : "memory", "cc", "eax", NACL_R14
423     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
424   );
425 }
426 
ARGBToRGB24Row_SSSE3(const uint8 * src,uint8 * dst,int width)427 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
428   asm volatile (
429     "movdqa    %3,%%xmm6                       \n"
430     LABELALIGN
431     "1:                                        \n"
432     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
433     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
434     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
435     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
436     "lea       " MEMLEA(0x40,0) ",%0           \n"
437     "pshufb    %%xmm6,%%xmm0                   \n"
438     "pshufb    %%xmm6,%%xmm1                   \n"
439     "pshufb    %%xmm6,%%xmm2                   \n"
440     "pshufb    %%xmm6,%%xmm3                   \n"
441     "movdqa    %%xmm1,%%xmm4                   \n"
442     "psrldq    $0x4,%%xmm1                     \n"
443     "pslldq    $0xc,%%xmm4                     \n"
444     "movdqa    %%xmm2,%%xmm5                   \n"
445     "por       %%xmm4,%%xmm0                   \n"
446     "pslldq    $0x8,%%xmm5                     \n"
447     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
448     "por       %%xmm5,%%xmm1                   \n"
449     "psrldq    $0x8,%%xmm2                     \n"
450     "pslldq    $0x4,%%xmm3                     \n"
451     "por       %%xmm3,%%xmm2                   \n"
452     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
453     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
454     "lea       " MEMLEA(0x30,1) ",%1           \n"
455     "sub       $0x10,%2                        \n"
456     "jg        1b                              \n"
457   : "+r"(src),  // %0
458     "+r"(dst),  // %1
459     "+r"(width)   // %2
460   : "m"(kShuffleMaskARGBToRGB24)  // %3
461   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
462   );
463 }
464 
ARGBToRAWRow_SSSE3(const uint8 * src,uint8 * dst,int width)465 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
466   asm volatile (
467     "movdqa    %3,%%xmm6                       \n"
468     LABELALIGN
469     "1:                                        \n"
470     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
471     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
472     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
473     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
474     "lea       " MEMLEA(0x40,0) ",%0           \n"
475     "pshufb    %%xmm6,%%xmm0                   \n"
476     "pshufb    %%xmm6,%%xmm1                   \n"
477     "pshufb    %%xmm6,%%xmm2                   \n"
478     "pshufb    %%xmm6,%%xmm3                   \n"
479     "movdqa    %%xmm1,%%xmm4                   \n"
480     "psrldq    $0x4,%%xmm1                     \n"
481     "pslldq    $0xc,%%xmm4                     \n"
482     "movdqa    %%xmm2,%%xmm5                   \n"
483     "por       %%xmm4,%%xmm0                   \n"
484     "pslldq    $0x8,%%xmm5                     \n"
485     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
486     "por       %%xmm5,%%xmm1                   \n"
487     "psrldq    $0x8,%%xmm2                     \n"
488     "pslldq    $0x4,%%xmm3                     \n"
489     "por       %%xmm3,%%xmm2                   \n"
490     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
491     "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
492     "lea       " MEMLEA(0x30,1) ",%1           \n"
493     "sub       $0x10,%2                        \n"
494     "jg        1b                              \n"
495   : "+r"(src),  // %0
496     "+r"(dst),  // %1
497     "+r"(width)   // %2
498   : "m"(kShuffleMaskARGBToRAW)  // %3
499   : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
500   );
501 }
502 
ARGBToRGB565Row_SSE2(const uint8 * src,uint8 * dst,int width)503 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
504   asm volatile (
505     "pcmpeqb   %%xmm3,%%xmm3                   \n"
506     "psrld     $0x1b,%%xmm3                    \n"
507     "pcmpeqb   %%xmm4,%%xmm4                   \n"
508     "psrld     $0x1a,%%xmm4                    \n"
509     "pslld     $0x5,%%xmm4                     \n"
510     "pcmpeqb   %%xmm5,%%xmm5                   \n"
511     "pslld     $0xb,%%xmm5                     \n"
512     LABELALIGN
513     "1:                                        \n"
514     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
515     "movdqa    %%xmm0,%%xmm1                   \n"
516     "movdqa    %%xmm0,%%xmm2                   \n"
517     "pslld     $0x8,%%xmm0                     \n"
518     "psrld     $0x3,%%xmm1                     \n"
519     "psrld     $0x5,%%xmm2                     \n"
520     "psrad     $0x10,%%xmm0                    \n"
521     "pand      %%xmm3,%%xmm1                   \n"
522     "pand      %%xmm4,%%xmm2                   \n"
523     "pand      %%xmm5,%%xmm0                   \n"
524     "por       %%xmm2,%%xmm1                   \n"
525     "por       %%xmm1,%%xmm0                   \n"
526     "packssdw  %%xmm0,%%xmm0                   \n"
527     "lea       " MEMLEA(0x10,0) ",%0           \n"
528     "movq      %%xmm0," MEMACCESS(1) "         \n"
529     "lea       " MEMLEA(0x8,1) ",%1            \n"
530     "sub       $0x4,%2                         \n"
531     "jg        1b                              \n"
532   : "+r"(src),  // %0
533     "+r"(dst),  // %1
534     "+r"(width)   // %2
535   :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
536   );
537 }
538 
ARGBToRGB565DitherRow_SSE2(const uint8 * src,uint8 * dst,const uint32 dither4,int width)539 void ARGBToRGB565DitherRow_SSE2(const uint8* src,
540                                 uint8* dst,
541                                 const uint32 dither4,
542                                 int width) {
543   asm volatile(
544       "movd       %3,%%xmm6                      \n"
545       "punpcklbw  %%xmm6,%%xmm6                  \n"
546       "movdqa     %%xmm6,%%xmm7                  \n"
547       "punpcklwd  %%xmm6,%%xmm6                  \n"
548       "punpckhwd  %%xmm7,%%xmm7                  \n"
549       "pcmpeqb    %%xmm3,%%xmm3                  \n"
550       "psrld      $0x1b,%%xmm3                   \n"
551       "pcmpeqb    %%xmm4,%%xmm4                  \n"
552       "psrld      $0x1a,%%xmm4                   \n"
553       "pslld      $0x5,%%xmm4                    \n"
554       "pcmpeqb    %%xmm5,%%xmm5                  \n"
555       "pslld      $0xb,%%xmm5                    \n"
556 
557       LABELALIGN
558       "1:                                        \n"
559       "movdqu     (%0),%%xmm0                    \n"
560       "paddusb    %%xmm6,%%xmm0                  \n"
561       "movdqa     %%xmm0,%%xmm1                  \n"
562       "movdqa     %%xmm0,%%xmm2                  \n"
563       "pslld      $0x8,%%xmm0                    \n"
564       "psrld      $0x3,%%xmm1                    \n"
565       "psrld      $0x5,%%xmm2                    \n"
566       "psrad      $0x10,%%xmm0                   \n"
567       "pand       %%xmm3,%%xmm1                  \n"
568       "pand       %%xmm4,%%xmm2                  \n"
569       "pand       %%xmm5,%%xmm0                  \n"
570       "por        %%xmm2,%%xmm1                  \n"
571       "por        %%xmm1,%%xmm0                  \n"
572       "packssdw   %%xmm0,%%xmm0                  \n"
573       "lea        0x10(%0),%0                    \n"
574       "movq       %%xmm0,(%1)                    \n"
575       "lea        0x8(%1),%1                     \n"
576       "sub        $0x4,%2                        \n"
577       "jg        1b                              \n"
578       : "+r"(src),    // %0
579         "+r"(dst),    // %1
580         "+r"(width)   // %2
581       : "m"(dither4)  // %3
582       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
583         "xmm7");
584 }
585 
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// Packs 8 four-byte pixels (32 bytes) of src per loop iteration into 8 16-bit
// 5:6:5 pixels (16 bytes) in dst, after adding a dither byte with unsigned
// saturation to every byte of each pixel.
//   src     - input, 32 bytes read per iteration (unaligned access).
//   dst     - output, 16 bytes written per iteration (unaligned access).
//   dither4 - four dither bytes; byte i (lowest first) is applied to pixels
//             i and i + 4 of every group of 8.
//   width   - pixel count, decremented by 8 per iteration.
// The loop body runs before the count is tested, so at least one group of 8
// is always processed and a width that is not a multiple of 8 is rounded up.
// NOTE(review): presumably callers deal with remainders -- confirm.
void ARGBToRGB565DitherRow_AVX2(const uint8* src,
                                uint8* dst,
                                const uint32 dither4,
                                int width) {
  asm volatile(
      // Build ymm6: each dither byte replicated 4 times, the same 16-byte
      // pattern in both 128-bit lanes.
      "vbroadcastss %3,%%xmm6                    \n"
      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
      "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
      // Per-dword masks: ymm3 = 0x0000001f, ymm4 = 0x000007e0,
      // ymm5 = 0x0000f800.
      "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
      "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
      "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
      "vpslld     $0x5,%%ymm4,%%ymm4             \n"
      "vpslld     $0xb,%%ymm3,%%ymm5             \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm0                    \n"
      "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"
      "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
      "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
      "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
      "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
      "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
      "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
      // The pack works per 128-bit lane; vpermq gathers the two useful
      // quadwords into the low 128 bits for the store.
      "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
      "lea        0x20(%0),%0                    \n"
      "vmovdqu    %%xmm0,(%1)                    \n"
      "lea        0x10(%1),%1                    \n"
      "sub        $0x8,%2                        \n"
      "jg         1b                             \n"
      "vzeroupper                                \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
631 
// Convert ARGB pixels to ARGB1555, 4 pixels (16 bytes in, 8 bytes out) per
// loop iteration.
// Each source pixel is treated as one 32-bit dword. The 16-bit result keeps
// the top five bits of source bytes 0, 1 and 2 in bits 0-4, 5-9 and 10-14,
// and the top bit of source byte 3 in bit 15.
// width is only tested after a full iteration, so every pass reads 16 bytes
// and writes 8 bytes even when fewer than 4 pixels remain.
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Build the per-dword masks once, outside the loop:
    //   xmm4 = 0x0000001f, xmm5 = 0x000003e0,
    //   xmm6 = 0x00007c00, xmm7 = 0xffff8000.
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrld     $0x1b,%%xmm4                    \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "pslld     $0x5,%%xmm5                     \n"
    "movdqa    %%xmm4,%%xmm6                   \n"
    "pslld     $0xa,%%xmm6                     \n"
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "pslld     $0xf,%%xmm7                     \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 source pixels.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    // Arithmetic shift: bit 31 lands in bit 15 and is also replicated into
    // bits 16-31, so the signed pack below does not saturate.
    "psrad     $0x10,%%xmm0                    \n"
    "psrld     $0x3,%%xmm1                     \n"  // byte 0 top bits -> 0-4.
    "psrld     $0x6,%%xmm2                     \n"  // byte 1 top bits -> 5-9.
    "psrld     $0x9,%%xmm3                     \n"  // byte 2 top bits -> 10-14.
    "pand      %%xmm7,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm6,%%xmm3                   \n"
    "por       %%xmm1,%%xmm0                   \n"
    "por       %%xmm3,%%xmm2                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"  // 4 dwords -> 4 words.
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store low 8 bytes.
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :: "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
673 
// Convert ARGB pixels to ARGB4444, 4 pixels (16 bytes in, 8 bytes out) per
// loop iteration.
// Each output byte combines the high nibbles of two adjacent source bytes:
// output byte 0 = (byte1 & 0xf0) | (byte0 >> 4),
// output byte 1 = (byte3 & 0xf0) | (byte2 >> 4).
// width is only tested after a full iteration, so every pass reads 16 bytes
// and writes 8 bytes even when fewer than 4 pixels remain.
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Masks: xmm4 = 0xf000 in every word, xmm3 = 0x00f0 in every word.
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0xc,%%xmm4                     \n"
    "movdqa    %%xmm4,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm3                     \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 source pixels.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm3,%%xmm0                   \n"  // high nibbles of bytes 0, 2.
    "pand      %%xmm4,%%xmm1                   \n"  // high nibbles of bytes 1, 3.
    // The masked bits sit at least 4 (resp. 8) bits above each word's low
    // end, so the 64-bit shifts never carry bits across word boundaries.
    "psrlq     $0x4,%%xmm0                     \n"
    "psrlq     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"  // each word is now <= 0xff.
    "packuswb  %%xmm0,%%xmm0                   \n"  // 8 words -> 8 bytes.
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store low 8 bytes.
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
702 #endif  // HAS_RGB24TOARGBROW_SSSE3
703 
704 #ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values per loop iteration.
// For each pixel, pmaddubsw + phaddw form the sum of its four unsigned bytes
// weighted by the signed coefficients in kARGBToY; the sum is shifted right
// by 7, packed to an unsigned byte, and kAddY16 is added byte-wise.
// NOTE(review): kAddY16 is defined outside this section; presumably 16 in
// every byte - confirm.
// width is only tested after a full iteration, so every pass reads 64 bytes
// and writes 16 bytes even when fewer than 16 pixels remain.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToY
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // 2 partial sums per pixel.
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // 1 word per pixel, 0-7.
    "phaddw    %%xmm3,%%xmm2                   \n"  // 1 word per pixel, 8-15.
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 bytes.
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
740 #endif  // HAS_ARGBTOYROW_SSSE3
741 
742 #ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values per loop iteration.
// Same instruction sequence as ARGBToYRow_SSSE3 with two differences: the
// coefficients come from kARGBToYJ, and kAddYJ64 is added to the 16-bit sums
// before the shift right by 7 (rounding) instead of adding an offset to the
// packed bytes afterwards.
// NOTE(review): kAddYJ64 is defined outside this section; presumably 64 in
// every word, i.e. half of 1 << 7 - confirm.
// width is only tested after a full iteration, so every pass reads 64 bytes
// and writes 16 bytes even when fewer than 16 pixels remain.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // 2 partial sums per pixel.
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // 1 word per pixel, 0-7.
    "phaddw    %%xmm3,%%xmm2                   \n"  // 1 word per pixel, 8-15.
    "paddw     %%xmm5,%%xmm0                   \n"  // rounding term.
    "paddw     %%xmm5,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 bytes.
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
780 #endif  // HAS_ARGBTOYJROW_SSSE3
781 
782 #ifdef HAS_ARGBTOYROW_AVX2
// Dword indices for vpermd. vphaddw and vpackuswb work independently on each
// 128-bit lane, which leaves the packed dwords in the order
// 0,2,4,6 | 1,3,5,7; this permutation puts them back into pixel order.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values per loop iteration.
// For each pixel the four unsigned bytes are weighted by the signed
// coefficients in kARGBToY, summed, shifted right by 7, packed to an
// unsigned byte, and kAddY16 is added byte-wise.
// width is only tested after a full iteration, so every pass reads 128 bytes
// and writes 32 bytes even when fewer than 32 pixels remain.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4                  \n"  // kARGBToY in both lanes.
    "vbroadcastf128 %4,%%ymm5                  \n"  // kAddY16 in both lanes.
    "vmovdqu    %5,%%ymm6                      \n"  // vpermd indices.

    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // in-lane: reorders pixels.
    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // in-lane: reorders pixels.
    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // restore pixel order.
    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
825 #endif  // HAS_ARGBTOYROW_AVX2
826 
827 #ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 YJ values per loop iteration.
// Same instruction sequence as ARGBToYRow_AVX2 with two differences: the
// coefficients come from kARGBToYJ, and kAddYJ64 is added to the 16-bit sums
// before the shift right by 7 (rounding) instead of adding an offset to the
// packed bytes afterwards.
// width is only tested after a full iteration, so every pass reads 128 bytes
// and writes 32 bytes even when fewer than 32 pixels remain.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4                  \n"  // kARGBToYJ in both lanes.
    "vbroadcastf128 %4,%%ymm5                  \n"  // kAddYJ64 in both lanes.
    "vmovdqu    %5,%%ymm6                      \n"  // vpermd indices.

    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // in-lane: reorders pixels.
    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // in-lane: reorders pixels.
    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // restore pixel order.
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
868 #endif  // HAS_ARGBTOYJROW_AVX2
869 
870 #ifdef HAS_ARGBTOUVROW_SSSE3
// Convert a 16x2 block of ARGB pixels to 8 U and 8 V values per loop
// iteration (2x2 subsampling).
//   src_argb0       - first source row.
//   src_stride_argb - byte offset from the first row to the second row.
//   dst_u, dst_v    - destination rows, 8 bytes written to each per pass.
//   width           - pixel count; decremented by 16 per pass.
// The two rows are averaged with pavgb, then even and odd pixels are
// averaged with pavgb, so each output sample comes from one 2x2 block.
// Each averaged pixel is weighted by kARGBToU / kARGBToV, shifted right
// arithmetically by 8, packed with signed saturation and offset by
// kAddUV128.
// NOTE(review): kAddUV128 is defined outside this section; presumably 128 in
// every byte - confirm.
// width is only tested after a full iteration, so every pass reads 64 bytes
// from each row even when fewer than 16 pixels remain.
void ARGBToUVRow_SSSE3(const uint8* src_argb0,
                       int src_stride_argb,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    "movdqa    %5,%%xmm3                       \n"  // xmm3 = kARGBToV
    "movdqa    %6,%%xmm4                       \n"  // xmm4 = kARGBToU
    "movdqa    %7,%%xmm5                       \n"  // xmm5 = kAddUV128
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    // Load 16 pixels from each row and average the rows.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"

    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // shufps 0x88 gathers even pixels, 0xdd odd pixels; average the pairs.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    // xmm0/xmm2 accumulate U, xmm1/xmm6 accumulate V.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V.
    "paddb     %%xmm5,%%xmm0                   \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // 8 U bytes -> dst_u.
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kARGBToV),  // %5
    "m"(kARGBToU),  // %6
    "m"(kAddUV128)  // %7
  // xmm3-xmm5 hold the constants loaded above, so they are clobbered too.
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
935 #endif  // HAS_ARGBTOUVROW_SSSE3
936 
937 #ifdef HAS_ARGBTOUVROW_AVX2
// Byte indices for vpshufb (applied per 128-bit lane). After the in-lane
// vshufps / vphaddw / vpacksswb and the vpermq, each lane holds its 16 bytes
// as pairs in the order 0,1,4,5,8,9,12,13,2,3,6,7,10,11,14,15; this shuffle
// restores ascending sample order.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
// Convert a 32x2 block of ARGB pixels to 16 U and 16 V values per loop
// iteration (2x2 subsampling).
//   src_argb0       - first source row.
//   src_stride_argb - byte offset from the first row to the second row.
//   dst_u, dst_v    - destination rows, 16 bytes written to each per pass.
//   width           - pixel count; decremented by 32 per pass.
// width is only tested after a full iteration, so every pass reads 128 bytes
// from each row even when fewer than 32 pixels remain.
void ARGBToUVRow_AVX2(const uint8* src_argb0,
                      int src_stride_argb,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5                  \n"  // ymm5 = kAddUV128
    "vbroadcastf128 %6,%%ymm6                  \n"  // ymm6 = kARGBToV
    "vbroadcastf128 %7,%%ymm7                  \n"  // ymm7 = kARGBToU
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
    "sub        %1,%2                          \n"

    LABELALIGN
    "1:                                        \n"
    // Load 32 pixels from the first row and average with the second row.
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea        " MEMLEA(0x80,0) ",%0          \n"
    // vshufps 0x88 gathers even pixels, 0xdd odd pixels; average the pairs.
    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"

    // ymm1/ymm3 accumulate U, ymm0/ymm2 accumulate V.
    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"  // U and V mixed per lane.
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // low lane = U, high = V.
    "vpshufb    %8,%%ymm0,%%ymm0               \n"  // restore sample order.
    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"  // 16 U bytes -> dst_u.
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x20,%3                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUV128),  // %5
    "m"(kARGBToV),   // %6
    "m"(kARGBToU),   // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
1003 #endif  // HAS_ARGBTOUVROW_AVX2
1004 
1005 #ifdef HAS_ARGBTOUVJROW_AVX2
// Convert a 32x2 block of ARGB pixels to 16 UJ and 16 VJ values per loop
// iteration (2x2 subsampling).
// Same instruction sequence as ARGBToUVRow_AVX2 with two differences: the
// coefficients come from kARGBToUJ / kARGBToVJ, and kAddUVJ128 is added to
// the 16-bit sums before the arithmetic shift right by 8 instead of adding
// an offset to the packed bytes afterwards.
// NOTE(review): kAddUVJ128 is defined outside this section; presumably it
// folds the 128 bias and a rounding term into one word constant - confirm.
// width is only tested after a full iteration, so every pass reads 128 bytes
// from each row even when fewer than 32 pixels remain.
void ARGBToUVJRow_AVX2(const uint8* src_argb0,
                       int src_stride_argb,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5                  \n"  // ymm5 = kAddUVJ128
    "vbroadcastf128 %6,%%ymm6                  \n"  // ymm6 = kARGBToVJ
    "vbroadcastf128 %7,%%ymm7                  \n"  // ymm7 = kARGBToUJ
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
    "sub        %1,%2                          \n"

    LABELALIGN
    "1:                                        \n"
    // Load 32 pixels from the first row and average with the second row.
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    // vshufps 0x88 gathers even pixels, 0xdd odd pixels; average the pairs.
    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"

    // ymm1/ymm3 accumulate U, ymm0/ymm2 accumulate V.
    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // add before the shift.
    "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"  // U and V mixed per lane.
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // low lane = U, high = V.
    "vpshufb    %8,%%ymm0,%%ymm0               \n"  // restore sample order.

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"  // 16 U bytes -> dst_u.
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUVJ128),  // %5
    "m"(kARGBToVJ),  // %6
    "m"(kARGBToUJ),  // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
1068 #endif  // HAS_ARGBTOUVJROW_AVX2
1069 
1070 #ifdef HAS_ARGBTOUVJROW_SSSE3
// Convert a 16x2 block of ARGB pixels to 8 UJ and 8 VJ values per loop
// iteration (2x2 subsampling).
//   src_argb0       - first source row.
//   src_stride_argb - byte offset from the first row to the second row.
//   dst_u, dst_v    - destination rows, 8 bytes written to each per pass.
//   width           - pixel count; decremented by 16 per pass.
// The two rows are averaged with pavgb, then even and odd pixels are
// averaged with pavgb, so each output sample comes from one 2x2 block.
// Each averaged pixel is weighted by kARGBToUJ / kARGBToVJ, kAddUVJ128 is
// added to the 16-bit sums, and the result is shifted right arithmetically
// by 8 and packed with signed saturation.
// NOTE(review): kAddUVJ128 is defined outside this section; presumably it
// folds the 128 bias and a rounding term into one word constant - confirm.
// width is only tested after a full iteration, so every pass reads 64 bytes
// from each row even when fewer than 16 pixels remain.
void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
                        int src_stride_argb,
                        uint8* dst_u,
                        uint8* dst_v,
                        int width) {
  asm volatile (
    "movdqa    %5,%%xmm3                       \n"  // xmm3 = kARGBToVJ
    "movdqa    %6,%%xmm4                       \n"  // xmm4 = kARGBToUJ
    "movdqa    %7,%%xmm5                       \n"  // xmm5 = kAddUVJ128
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    // Load 16 pixels from each row and average the rows.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"

    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // shufps 0x88 gathers even pixels, 0xdd odd pixels; average the pairs.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    // xmm0/xmm2 accumulate U, xmm1/xmm6 accumulate V.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // add before the shift.
    "paddw     %%xmm5,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V.
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // 8 U bytes -> dst_u.
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kARGBToVJ),  // %5
    "m"(kARGBToUJ),  // %6
    "m"(kAddUVJ128)  // %7
  // xmm3-xmm5 hold the constants loaded above, so they are clobbered too.
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
1136 #endif  // HAS_ARGBTOUVJROW_SSSE3
1137 
1138 #ifdef HAS_ARGBTOUV444ROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 U and 16 V values per loop
// iteration, with no subsampling.
//   src_argb     - source row.
//   dst_u, dst_v - destination rows, 16 bytes written to each per pass.
//   width        - pixel count; decremented by 16 per pass.
// The same 64 source bytes are loaded twice: the first pass weights them by
// kARGBToU and stores to dst_u, the second weights them by kARGBToV and
// stores to dst_v. In both passes the sums are shifted right arithmetically
// by 8, packed with signed saturation and offset by kAddUV128.
// NOTE(review): kAddUV128 is defined outside this section; presumably 128 in
// every byte - confirm.
// width is only tested after a full iteration, so every pass reads 64 bytes
// and writes 16 bytes to each destination even when fewer than 16 pixels
// remain.
void ARGBToUV444Row_SSSE3(const uint8* src_argb,
                          uint8* dst_u,
                          uint8* dst_v,
                          int width) {
  asm volatile (
    "movdqa    %4,%%xmm3                       \n"  // xmm3 = kARGBToV
    "movdqa    %5,%%xmm4                       \n"  // xmm4 = kARGBToU
    "movdqa    %6,%%xmm5                       \n"  // xmm5 = kAddUV128
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    // U pass.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // 16 U bytes -> dst_u.
    // V pass: reload the same pixels (the U pass overwrote them).
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm3,%%xmm0                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),        // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "m"(kARGBToV),  // %4
    "m"(kARGBToU),  // %5
    "m"(kAddUV128)  // %6
  // xmm3-xmm5 hold the constants loaded above, so they are clobbered too.
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
1196 #endif  // HAS_ARGBTOUV444ROW_SSSE3
1197 
// Convert 16 BGRA pixels (64 bytes) to 16 Y values per loop iteration.
// For each pixel, pmaddubsw + phaddw form the sum of its four unsigned bytes
// weighted by the signed coefficients in kBGRAToY; the sum is shifted right
// by 7, packed to an unsigned byte, and kAddY16 is added byte-wise.
// NOTE(review): kAddY16 is defined outside this section; presumably 16 in
// every byte - confirm.
// width is only tested after a full iteration, so every pass reads 64 bytes
// and writes 16 bytes even when fewer than 16 pixels remain.
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kBGRAToY

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // 2 partial sums per pixel.
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // 1 word per pixel, 0-7.
    "phaddw    %%xmm3,%%xmm2                   \n"  // 1 word per pixel, 8-15.
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 bytes.
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1232 
// Convert a 16x2 block of BGRA pixels to 8 U and 8 V values per loop
// iteration (2x2 subsampling).
//   src_bgra0       - first source row.
//   src_stride_bgra - byte offset from the first row to the second row.
//   dst_u, dst_v    - destination rows, 8 bytes written to each per pass.
//   width           - pixel count; decremented by 16 per pass.
// The two rows are averaged with pavgb, then even and odd pixels are
// averaged with pavgb, so each output sample comes from one 2x2 block.
// Each averaged pixel is weighted by kBGRAToU / kBGRAToV, shifted right
// arithmetically by 8, packed with signed saturation and offset by
// kAddUV128.
// NOTE(review): kAddUV128 is defined outside this section; presumably 128 in
// every byte - confirm.
// width is only tested after a full iteration, so every pass reads 64 bytes
// from each row even when fewer than 16 pixels remain.
void BGRAToUVRow_SSSE3(const uint8* src_bgra0,
                       int src_stride_bgra,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    "movdqa    %5,%%xmm3                       \n"  // xmm3 = kBGRAToV
    "movdqa    %6,%%xmm4                       \n"  // xmm4 = kBGRAToU
    "movdqa    %7,%%xmm5                       \n"  // xmm5 = kAddUV128
    // Turn dst_v into an offset from dst_u so only dst_u needs advancing.
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    // Load 16 pixels from each row and average the rows.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"

    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // shufps 0x88 gathers even pixels, 0xdd odd pixels; average the pairs.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    // xmm0/xmm2 accumulate U, xmm1/xmm6 accumulate V.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V.
    "paddb     %%xmm5,%%xmm0                   \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // 8 U bytes -> dst_u.
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_bgra0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_bgra)), // %4
    "m"(kBGRAToV),  // %5
    "m"(kBGRAToU),  // %6
    "m"(kAddUV128)  // %7
  // xmm3-xmm5 hold the constants loaded above, so they are clobbered too.
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
1297 
// Convert 16 ABGR pixels (64 bytes) to 16 Y values per loop iteration.
// For each pixel, pmaddubsw + phaddw form the sum of its four unsigned bytes
// weighted by the signed coefficients in kABGRToY; the sum is shifted right
// by 7, packed to an unsigned byte, and kAddY16 is added byte-wise.
// NOTE(review): kABGRToY and kAddY16 are defined outside this section;
// kAddY16 is presumably 16 in every byte - confirm.
// width is only tested after a full iteration, so every pass reads 64 bytes
// and writes 16 bytes even when fewer than 16 pixels remain.
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kABGRToY

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // 2 partial sums per pixel.
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // 1 word per pixel, 0-7.
    "phaddw    %%xmm3,%%xmm2                   \n"  // 1 word per pixel, 8-15.
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 bytes.
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1332 
// Computes one row of 8-bit Y values from packed source pixels with SSSE3.
// Each loop iteration reads 64 bytes from src_rgba and writes 16 bytes to
// dst_y.
//   src_rgba: source row; read with unaligned loads (movdqu).
//   dst_y:    destination row; written with an unaligned store (movdqu).
//   width:    count; reduced by 16 per iteration.
// The count is tested after the loop body, so one full group of 16 outputs is
// always processed.  Presumably callers pass a positive multiple of 16 - TODO
// confirm against the dispatch code.
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    // xmm5 = kAddY16 (offset added to every output byte),
    // xmm4 = kRGBAToY (per-byte weights).  movdqa needs both constants to be
    // 16-byte aligned.
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Multiply unsigned source bytes by the signed weights and add adjacent
    // byte pairs into 16-bit lanes.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Add adjacent 16-bit lanes: one weighted sum per 4 source bytes.
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    // Scale down by 128, pack to unsigned bytes with saturation, add offset.
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1367 
// Computes one row of subsampled U and V bytes from two source rows with
// SSSE3.  Each loop iteration reads 64 bytes from the row at src_abgr0 and 64
// bytes from the row src_stride_abgr bytes after it, averages them 2x2, and
// writes 8 bytes to dst_u and 8 bytes to dst_v.
//   src_abgr0:       first source row; read with unaligned loads.
//   src_stride_abgr: byte offset from the first source row to the second.
//   dst_u, dst_v:    destination rows; 8 bytes written to each per iteration.
//   width:           count; reduced by 16 per iteration.
// The count is tested after the loop body, so one full group is always
// processed.  Presumably callers pass a positive multiple of 16 - TODO confirm
// against the dispatch code.
void ABGRToUVRow_SSSE3(const uint8* src_abgr0,
                       int src_stride_abgr,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    // xmm3 = kABGRToV weights, xmm4 = kABGRToU weights, xmm5 = kAddUV128 bias.
    "movdqa    %5,%%xmm3                       \n"
    "movdqa    %6,%%xmm4                       \n"
    "movdqa    %7,%%xmm5                       \n"
    // Turn dst_v into an offset from dst_u so a single pointer increment
    // advances both destinations.
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    // Load 64 bytes from each of the two rows and average them vertically.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"

    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Split even (0x88) and odd (0xdd) 32-bit elements and average them, which
    // averages horizontally adjacent 4-byte pixels.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    // Apply the U weights to xmm0/xmm2 and the V weights to copies in
    // xmm1/xmm6, then sum each pixel's products.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    // Arithmetic shift by 8, pack to signed bytes (U in the low 8 bytes, V in
    // the high 8 bytes) and add the bias.
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    // Low half goes to dst_u, high half to dst_u + (dst_v - dst_u).
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_abgr0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_abgr)), // %4
    "m"(kABGRToV),  // %5
    "m"(kABGRToU),  // %6
    "m"(kAddUV128)  // %7
  // xmm3, xmm4 and xmm5 hold the constants loaded above, so they must be
  // declared as clobbered along with the working registers.
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
1432 
// Computes one row of subsampled U and V bytes from two source rows with
// SSSE3.  Each loop iteration reads 64 bytes from the row at src_rgba0 and 64
// bytes from the row src_stride_rgba bytes after it, averages them 2x2, and
// writes 8 bytes to dst_u and 8 bytes to dst_v.
//   src_rgba0:       first source row; read with unaligned loads.
//   src_stride_rgba: byte offset from the first source row to the second.
//   dst_u, dst_v:    destination rows; 8 bytes written to each per iteration.
//   width:           count; reduced by 16 per iteration.
// The count is tested after the loop body, so one full group is always
// processed.  Presumably callers pass a positive multiple of 16 - TODO confirm
// against the dispatch code.
void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
                       int src_stride_rgba,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    // xmm3 = kRGBAToV weights, xmm4 = kRGBAToU weights, xmm5 = kAddUV128 bias.
    "movdqa    %5,%%xmm3                       \n"
    "movdqa    %6,%%xmm4                       \n"
    "movdqa    %7,%%xmm5                       \n"
    // Turn dst_v into an offset from dst_u so a single pointer increment
    // advances both destinations.
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    // Load 64 bytes from each of the two rows and average them vertically.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"

    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Split even (0x88) and odd (0xdd) 32-bit elements and average them, which
    // averages horizontally adjacent 4-byte pixels.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    // Apply the U weights to xmm0/xmm2 and the V weights to copies in
    // xmm1/xmm6, then sum each pixel's products.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    // Arithmetic shift by 8, pack to signed bytes (U in the low 8 bytes, V in
    // the high 8 bytes) and add the bias.
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    // Low half goes to dst_u, high half to dst_u + (dst_v - dst_u).
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_rgba0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_rgba)), // %4
    "m"(kRGBAToV),  // %5
    "m"(kRGBAToU),  // %6
    "m"(kAddUV128)  // %7
  // xmm3, xmm4 and xmm5 hold the constants loaded above, so they must be
  // declared as clobbered along with the working registers.
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
1497 
1498 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
1499 
// Read 8 UV from 444
// Loads 8 U bytes from %[u_buf], 8 V bytes via MEMOPREG (base %[u_buf], index
// %[v_buf], so %[v_buf] must already hold the V-minus-U byte offset) and 8 Y
// bytes from %[y_buf].
// Result: xmm0 = 8 interleaved U,V byte pairs; xmm4 = 8 Y bytes, each
// duplicated into both bytes of a 16-bit lane.
// Advances %[u_buf] and %[y_buf] by 8.  Overwrites xmm0, xmm1 and xmm4.
#define READYUV444 \
  "movq       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
1509 
// Read 4 UV from 422, upsample to 8 UV
// Loads 4 U bytes from %[u_buf], 4 V bytes via MEMOPREG (base %[u_buf], index
// %[v_buf], so %[v_buf] must already hold the V-minus-U byte offset) and 8 Y
// bytes from %[y_buf].
// Result: xmm0 = 8 U,V byte pairs, each loaded pair repeated twice
// (punpcklwd of the register with itself); xmm4 = 8 Y bytes, each duplicated
// into both bytes of a 16-bit lane.
// Advances %[u_buf] by 4 and %[y_buf] by 8.  Overwrites xmm0, xmm1 and xmm4.
#define READYUV422 \
  "movd       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
1520 
// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
// Same loads and register layout as READYUV422 (xmm0 = 8 U,V pairs, xmm4 = 8
// duplicated Y bytes), plus 8 alpha bytes from %[a_buf] into the low half of
// xmm5.
// Advances %[u_buf] by 4 and %[y_buf] and %[a_buf] by 8.
// Overwrites xmm0, xmm1, xmm4 and xmm5.
#define READYUVA422 \
  "movd       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
    "movq       " MEMACCESS([a_buf]) ",%%xmm5                   \n"            \
    "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]               \n"
1533 
// Read 4 UV from NV12, upsample to 8 UV
// Loads 8 bytes (4 already-interleaved 2-byte pairs) from %[uv_buf] and 8 Y
// bytes from %[y_buf].
// Result: xmm0 = 8 byte pairs, each loaded pair repeated twice; xmm4 = 8 Y
// bytes, each duplicated into both bytes of a 16-bit lane.
// Advances %[uv_buf] and %[y_buf] by 8.  Overwrites xmm0 and xmm4.
#define READNV12 \
  "movq       " MEMACCESS([uv_buf]) ",%%xmm0                    \n"            \
    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
1542 
// Read 4 VU from NV21, upsample to 8 UV
// Loads 8 bytes from %[vu_buf] and rearranges them with pshufb using the
// %[kShuffleNV21] operand (the table is defined elsewhere; presumably it swaps
// each V,U pair to U,V and repeats it - confirm against the table).  Loads 8 Y
// bytes from %[y_buf] into xmm4, each duplicated into both bytes of a 16-bit
// lane.
// Advances %[vu_buf] and %[y_buf] by 8.  Overwrites xmm0 and xmm4.
#define READNV21 \
  "movq       " MEMACCESS([vu_buf]) ",%%xmm0                    \n"            \
    "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n"            \
    "pshufb     %[kShuffleNV21], %%xmm0                         \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
1551 
// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
// Loads the same 16 bytes from %[yuy2_buf] twice: once into xmm4, shuffled
// with %[kShuffleYUY2Y], and once into xmm0, shuffled with %[kShuffleYUY2UV].
// Both shuffle tables are defined elsewhere; presumably they produce the same
// xmm4 (Y) and xmm0 (UV) layout as the other READ* macros - confirm against
// the tables.
// Advances %[yuy2_buf] by 16.  Overwrites xmm0 and xmm4.
#define READYUY2 \
  "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                  \n"            \
    "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n"            \
    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                \n"            \
    "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n"            \
    "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]        \n"
1559 
// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
// Loads the same 16 bytes from %[uyvy_buf] twice: once into xmm4, shuffled
// with %[kShuffleUYVYY], and once into xmm0, shuffled with %[kShuffleUYVYUV].
// Both shuffle tables are defined elsewhere; presumably they produce the same
// xmm4 (Y) and xmm0 (UV) layout as the other READ* macros - confirm against
// the tables.
// Advances %[uyvy_buf] by 16.  Overwrites xmm0 and xmm4.
#define READUYVY \
  "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                  \n"            \
    "pshufb     %[kShuffleUYVYY], %%xmm4                        \n"            \
    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                \n"            \
    "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n"            \
    "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]        \n"
1567 
// YUV to RGB conversion core shared by the row functions.  Two variants:
//   x86_64: YUVTORGB_SETUP loads seven 16-byte vectors from the
//           %[yuvconstants] operand (byte offsets 0, 32, 64, 96, 128, 160 and
//           192) into xmm8..xmm14 once, before the loop; YUVTORGB then works
//           from those registers and YUVTORGB_REGS names them as clobbers.
//   other:  YUVTORGB_SETUP is empty and YUVTORGB reads the same offsets from
//           memory on every use; YUVTORGB_REGS is empty.
// movdqa is used for the loads, so the constants must be 16-byte aligned.
#if defined(__x86_64__)
#define YUVTORGB_SETUP(yuvconstants) \
  "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8              \n"            \
    "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n"            \
    "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n"            \
    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n"            \
    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm12     \n"            \
    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n"            \
    "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"
// Convert 8 pixels: 8 UV and 8 Y
// Input:  xmm0 = 8 UV byte pairs, xmm4 = 8 Y values in 16-bit lanes.
// Each of the three outputs is computed as
//   (bias - pmaddubsw(UV, weights) + pmulhuw(Y, xmm14)) >> 6
// with saturating adds (paddsw), an arithmetic shift, and a saturating pack
// to unsigned bytes.  Weights/bias registers per output:
//   xmm0: xmm8 / xmm11,  xmm1: xmm9 / xmm12,  xmm2: xmm10 / xmm13.
// Each result's 8 bytes appear in both halves of its register.
// Overwrites xmm0..xmm4.
#define YUVTORGB(yuvconstants)                                    \
  "movdqa     %%xmm0,%%xmm1                                   \n" \
  "movdqa     %%xmm0,%%xmm2                                   \n" \
  "movdqa     %%xmm0,%%xmm3                                   \n" \
  "movdqa     %%xmm11,%%xmm0                                  \n" \
  "pmaddubsw  %%xmm8,%%xmm1                                   \n" \
  "psubw      %%xmm1,%%xmm0                                   \n" \
  "movdqa     %%xmm12,%%xmm1                                  \n" \
  "pmaddubsw  %%xmm9,%%xmm2                                   \n" \
  "psubw      %%xmm2,%%xmm1                                   \n" \
  "movdqa     %%xmm13,%%xmm2                                  \n" \
  "pmaddubsw  %%xmm10,%%xmm3                                  \n" \
  "psubw      %%xmm3,%%xmm2                                   \n" \
  "pmulhuw    %%xmm14,%%xmm4                                  \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm1                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n" \
  "psraw      $0x6,%%xmm0                                     \n" \
  "psraw      $0x6,%%xmm1                                     \n" \
  "psraw      $0x6,%%xmm2                                     \n" \
  "packuswb   %%xmm0,%%xmm0                                   \n" \
  "packuswb   %%xmm1,%%xmm1                                   \n" \
  "packuswb   %%xmm2,%%xmm2                                   \n"
// Extra clobbers for the registers filled by YUVTORGB_SETUP.  The trailing
// comma lets the macro be placed directly in front of further clobber strings.
#define YUVTORGB_REGS \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else
// No setup needed: the constants are used as memory operands below.
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
// Same computation and register contract as the x86_64 variant (input xmm0 =
// UV, xmm4 = Y; outputs in xmm0, xmm1, xmm2; overwrites xmm0..xmm4), but the
// weights (offsets 0, 32, 64), biases (offsets 96, 128, 160) and Y multiplier
// (offset 192) are read from %[yuvconstants] in memory.
#define YUVTORGB(yuvconstants) \
  "movdqa     %%xmm0,%%xmm1                                     \n"            \
    "movdqa     %%xmm0,%%xmm2                                   \n"            \
    "movdqa     %%xmm0,%%xmm3                                   \n"            \
    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0       \n"            \
    "pmaddubsw  " MEMACCESS([yuvconstants]) ",%%xmm1            \n"            \
    "psubw      %%xmm1,%%xmm0                                   \n"            \
    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm1      \n"            \
    "pmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%xmm2       \n"            \
    "psubw      %%xmm2,%%xmm1                                   \n"            \
    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2      \n"            \
    "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3       \n"            \
    "psubw      %%xmm3,%%xmm2                                   \n"            \
    "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4      \n"            \
    "paddsw     %%xmm4,%%xmm0                                   \n"            \
    "paddsw     %%xmm4,%%xmm1                                   \n"            \
    "paddsw     %%xmm4,%%xmm2                                   \n"            \
    "psraw      $0x6,%%xmm0                                     \n"            \
    "psraw      $0x6,%%xmm1                                     \n"            \
    "psraw      $0x6,%%xmm2                                     \n"            \
    "packuswb   %%xmm0,%%xmm0                                   \n"            \
    "packuswb   %%xmm1,%%xmm1                                   \n"            \
    "packuswb   %%xmm2,%%xmm2                                   \n"
// No additional registers are used by this variant.
#define YUVTORGB_REGS
#endif
1632 
// Store 8 ARGB values.
// Input: xmm0, xmm1, xmm2 = the three channel results of YUVTORGB (8 bytes
// each); xmm5 = 8 bytes for the fourth channel in its low half (set by the
// caller before the loop or loaded by READYUVA422).
// Interleaves the four registers so each 4-byte pixel in memory holds, in
// order, a byte from xmm0, xmm1, xmm2 and xmm5.
// Writes 32 bytes at %[dst_argb] with unaligned stores and advances it by 32.
// Overwrites xmm0, xmm1 and xmm2; xmm5 is only read.
#define STOREARGB \
  "punpcklbw  %%xmm1,%%xmm0                                      \n"           \
    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
    "movdqa     %%xmm0,%%xmm1                                    \n"           \
    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
1643 
// Store 8 RGBA values.
// Input: xmm0, xmm1, xmm2 = the three channel results of YUVTORGB (8 bytes
// each).  xmm5 is set to all ones here (pcmpeqb with itself), so the first
// byte of every pixel is 0xff.
// Interleaves so each 4-byte pixel in memory holds, in order, 0xff and a byte
// from xmm0, xmm1 and xmm2.
// Writes 32 bytes at %[dst_rgba] with unaligned stores and advances it by 32.
// Overwrites xmm0, xmm1 and xmm5.
#define STORERGBA \
  "pcmpeqb   %%xmm5,%%xmm5                                       \n"           \
    "punpcklbw %%xmm2,%%xmm1                                     \n"           \
    "punpcklbw %%xmm0,%%xmm5                                     \n"           \
    "movdqa    %%xmm5,%%xmm0                                     \n"           \
    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
1655 
// Converts one row from separate Y, U and V planes to 4-byte output pixels
// with SSSE3.  Each iteration consumes 8 bytes from each of the three planes
// (READYUV444) and writes 32 bytes to dst_argb (STOREARGB).
//   y_buf, u_buf, v_buf: source planes.
//   dst_argb:            destination row.
//   yuvconstants:        conversion vectors read by YUVTORGB_SETUP/YUVTORGB.
//   width:               count; reduced by 8 per iteration.
// The count is tested after the loop body, so one full group of 8 pixels is
// always processed.  Presumably callers pass a positive multiple of 8 - TODO
// confirm against the dispatch code.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // v_buf becomes the offset from u_buf; the read macro addresses V through
    // u_buf plus this offset.
    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all ones: the 0xff fourth-channel bytes used by STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1684 
// Converts one row from separate Y, U and V planes to 3-byte output pixels
// with SSSE3.  Each iteration consumes 8 Y bytes and 4 bytes each of U and V
// (READYUV422) and writes 24 bytes to dst_rgb24.
//   y_buf, u_buf, v_buf: source planes.
//   dst_rgb24:           destination row; advanced by 24 bytes per iteration.
//   yuvconstants:        conversion vectors read by YUVTORGB_SETUP/YUVTORGB.
//   width:               count; reduced by 8 per iteration.
// The count is tested after the loop body, so one full group of 8 pixels is
// always processed.  Presumably callers pass a positive multiple of 8 - TODO
// confirm against the dispatch code.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // xmm5/xmm6 = byte shuffle masks (tables defined elsewhere) applied to the
    // first and second group of four 4-byte pixels below.
    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
    // v_buf becomes the offset from u_buf, as READYUV422 expects.
    "sub       %[u_buf],%[v_buf]               \n"

    LABELALIGN
    "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    // Build eight 4-byte pixels (xmm0, xmm1, xmm2, xmm2 channel bytes): the
    // first four pixels in xmm0, the last four in xmm1.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    // Shuffle each register with its mask, then shift 4 bytes from the top of
    // xmm0 into the bottom of xmm1 so that the 8-byte and 16-byte stores below
    // together emit 24 contiguous bytes.
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
    // Explicit 'l' suffix: the width operand may be a memory operand.
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
  // On i386 width is forced into memory, presumably to save a general-purpose
  // register for the pointer operands.
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
1730 
// Converts one row from separate Y, U and V planes to 4-byte output pixels
// with SSSE3.  Each iteration consumes 8 Y bytes and 4 bytes each of U and V
// (READYUV422) and writes 32 bytes to dst_argb (STOREARGB).
//   y_buf, u_buf, v_buf: source planes.
//   dst_argb:            destination row.
//   yuvconstants:        conversion vectors read by YUVTORGB_SETUP/YUVTORGB.
//   width:               count; reduced by 8 per iteration.
// The count is tested after the loop body, so one full group of 8 pixels is
// always processed.  Presumably callers pass a positive multiple of 8 - TODO
// confirm against the dispatch code.
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // v_buf becomes the offset from u_buf, as READYUV422 expects.
    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all ones: the 0xff fourth-channel bytes used by STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1759 
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
// Converts one row from separate Y, U, V and alpha planes to 4-byte output
// pixels with SSSE3.  Each iteration consumes 8 Y bytes, 8 alpha bytes and 4
// bytes each of U and V (READYUVA422) and writes 32 bytes to dst_argb.
//   y_buf, u_buf, v_buf: source planes.
//   a_buf:               alpha plane; its bytes become the fourth byte of each
//                        output pixel via xmm5 in STOREARGB.
//   dst_argb:            destination row.
//   yuvconstants:        conversion vectors read by YUVTORGB_SETUP/YUVTORGB.
//   width:               count; reduced by 8 per iteration.
// The count is tested after the loop body, so one full group of 8 pixels is
// always processed.  Presumably callers pass a positive multiple of 8 - TODO
// confirm against the dispatch code.
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                                     const uint8* u_buf,
                                     const uint8* v_buf,
                                     const uint8* a_buf,
                                     uint8* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // v_buf becomes the offset from u_buf, as READYUVA422 expects.
    "sub       %[u_buf],%[v_buf]               \n"

    LABELALIGN
    "1:                                        \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    // Explicit 'l' suffix: the width operand may be a memory operand.
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
  // On i386 width is forced into memory, presumably to save a general-purpose
  // register for the six pointer operands.
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_SSSE3
1797 
// Converts one row from a Y plane plus an interleaved 2-byte chroma plane to
// 4-byte output pixels with SSSE3.  Each iteration consumes 8 bytes from y_buf
// and 8 bytes from uv_buf (READNV12) and writes 32 bytes to dst_argb.
//   y_buf:        Y plane.
//   uv_buf:       interleaved chroma plane.
//   dst_argb:     destination row.
//   yuvconstants: conversion vectors read by YUVTORGB_SETUP/YUVTORGB.
//   width:        count; reduced by 8 per iteration.
// The count is tested after the loop body, so one full group of 8 pixels is
// always processed.  Presumably callers pass a positive multiple of 8 - TODO
// confirm against the dispatch code.
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // xmm5 = all ones: the 0xff fourth-channel bytes used by STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READNV12
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
1825 
// Converts one row from a Y plane plus an interleaved 2-byte chroma plane to
// 4-byte output pixels with SSSE3.  Each iteration consumes 8 bytes from y_buf
// and 8 bytes from vu_buf (READNV21, which reorders the chroma bytes with
// kShuffleNV21) and writes 32 bytes to dst_argb.
//   y_buf:        Y plane.
//   vu_buf:       interleaved chroma plane.
//   dst_argb:     destination row.
//   yuvconstants: conversion vectors read by YUVTORGB_SETUP/YUVTORGB.
//   width:        count; reduced by 8 per iteration.
// The count is tested after the loop body, so one full group of 8 pixels is
// always processed.  Presumably callers pass a positive multiple of 8 - TODO
// confirm against the dispatch code.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* vu_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // xmm5 = all ones: the 0xff fourth-channel bytes used by STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READNV21
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
1854 
// Converts one row of packed YUY2 data to 4-byte output pixels with SSSE3.
// Each iteration consumes 16 bytes from yuy2_buf (READYUY2, which separates
// the Y and UV bytes with the kShuffleYUY2Y / kShuffleYUY2UV tables) and
// writes 32 bytes to dst_argb.
//   yuy2_buf:     packed source row; read with unaligned loads.
//   dst_argb:     destination row.
//   yuvconstants: conversion vectors read by YUVTORGB_SETUP/YUVTORGB.
//   width:        count; reduced by 8 per iteration.
// The count is tested after the loop body, so one full group of 8 pixels is
// always processed.  Presumably callers pass a positive multiple of 8 - TODO
// confirm against the dispatch code.
void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // xmm5 = all ones: the 0xff fourth-channel bytes used by STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READYUY2
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
1882 
// Converts one row of packed UYVY data to 4-byte output pixels with SSSE3.
// Each iteration consumes 16 bytes from uyvy_buf (READUYVY, which separates
// the Y and UV bytes with the kShuffleUYVYY / kShuffleUYVYUV tables) and
// writes 32 bytes to dst_argb.
//   uyvy_buf:     packed source row; read with unaligned loads.
//   dst_argb:     destination row.
//   yuvconstants: conversion vectors read by YUVTORGB_SETUP/YUVTORGB.
//   width:        count; reduced by 8 per iteration.
// The count is tested after the loop body, so one full group of 8 pixels is
// always processed.  Presumably callers pass a positive multiple of 8 - TODO
// confirm against the dispatch code.
void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // xmm5 = all ones: the 0xff fourth-channel bytes used by STOREARGB.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READUYVY
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
1910 
// Converts one row from separate Y, U and V planes to 4-byte output pixels
// with SSSE3, using the STORERGBA byte order (0xff first in each pixel).
// Each iteration consumes 8 Y bytes and 4 bytes each of U and V (READYUV422)
// and writes 32 bytes to dst_rgba.
//   y_buf, u_buf, v_buf: source planes.
//   dst_rgba:            destination row.
//   yuvconstants:        conversion vectors read by YUVTORGB_SETUP/YUVTORGB.
//   width:               count; reduced by 8 per iteration.
// The count is tested after the loop body, so one full group of 8 pixels is
// always processed.  Presumably callers pass a positive multiple of 8 - TODO
// confirm against the dispatch code.
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // v_buf becomes the offset from u_buf, as READYUV422 expects.
    "sub       %[u_buf],%[v_buf]               \n"
    // Sets xmm5 to all ones; STORERGBA sets it again on every iteration.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGBA
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1939 
1940 #endif  // HAS_I422TOARGBROW_SSSE3
1941 
// Read 16 UV from 444
// Loads 16 U bytes from [u_buf] and 16 V bytes via the base+index operand
// built from [u_buf] and [v_buf] (callers first turn v_buf into an offset
// from u_buf), then advances u_buf by 16. vpermq $0xd8 reorders the
// quadwords to 0,2,1,3 so the per-lane vpunpcklbw interleaves all 16 U/V
// pairs into ymm0.
// Loads 16 Y bytes from [y_buf], duplicates each byte into a 16-bit word in
// ymm4, and advances y_buf by 16.
#define READYUV444_AVX2 \
  "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
    MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)                         \
    "lea        " MEMLEA(0x10, [u_buf]) ",%[u_buf]                  \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n"        \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
1954 
// Read 8 UV from 422, upsample to 16 UV.
// Loads 8 U bytes from [u_buf] and 8 V bytes via the base+index operand built
// from [u_buf] and [v_buf] (callers first turn v_buf into an offset from
// u_buf), then advances u_buf by 8. The bytes are interleaved into UV pairs
// and vpunpcklwd duplicates each 16-bit pair, leaving 16 UV pairs in ymm0.
// Loads 16 Y bytes from [y_buf], duplicates each byte into a 16-bit word in
// ymm4, and advances y_buf by 16.
#define READYUV422_AVX2 \
  "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
1967 
// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
// Same U/V/Y handling as READYUV422_AVX2: 16 UV pairs end up in ymm0 and 16
// duplicated Y words in ymm4; u_buf advances by 8 and y_buf by 16.
// Additionally loads 16 alpha bytes from [a_buf] into ymm5, applies the same
// vpermq $0xd8 quadword reorder used before the per-lane unpacks, and
// advances a_buf by 16.
#define READYUVA422_AVX2 \
  "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"        \
    "vmovdqu    " MEMACCESS([a_buf]) ",%%xmm5                       \n"        \
    "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n"        \
    "lea        " MEMLEA(0x10, [a_buf]) ",%[a_buf]                  \n"
1983 
// Read 8 UV from NV12, upsample to 16 UV.
// Loads 16 bytes (8 interleaved UV pairs) from [uv_buf] and advances uv_buf
// by 16; vpunpcklwd duplicates each 16-bit pair, leaving 16 UV pairs in ymm0.
// Loads 16 Y bytes from [y_buf], duplicates each byte into a 16-bit word in
// ymm4, and advances y_buf by 16.
#define READNV12_AVX2 \
  "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                        \n"        \
    "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
1994 
// Read 8 VU from NV21, upsample to 16 UV.
// Loads 16 bytes (8 interleaved VU pairs) from [vu_buf] and advances vu_buf
// by 16, then rearranges them into ymm0 with vpshufb using the
// %[kShuffleNV21] operand, which the enclosing asm statement must supply.
// NOTE(review): kShuffleNV21 is defined outside this view; presumably it
// swaps V/U and duplicates each pair - confirm.
// Loads 16 Y bytes from [y_buf], duplicates each byte into a 16-bit word in
// ymm4, and advances y_buf by 16.
#define READNV21_AVX2 \
  "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                        \n"        \
    "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
2005 
// Read 32 bytes of packed YUY2 (16 pixels: 16 Y and 8 UV pairs).
// The same 32 bytes are loaded twice: once shuffled with %[kShuffleYUY2Y]
// into ymm4 (Y) and once with %[kShuffleYUY2UV] into ymm0 (UV). Both shuffle
// operands must be supplied by the enclosing asm statement. Advances
// yuy2_buf by 32.
// NOTE(review): the shuffle tables are defined outside this view; presumably
// they duplicate Y bytes and upsample 8 UV to 16 UV - confirm.
#define READYUY2_AVX2 \
  "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                      \n"        \
    "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n"        \
    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                    \n"        \
    "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n"        \
    "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]            \n"
2013 
// Read 32 bytes of packed UYVY (16 pixels: 16 Y and 8 UV pairs).
// The same 32 bytes are loaded twice: once shuffled with %[kShuffleUYVYY]
// into ymm4 (Y) and once with %[kShuffleUYVYUV] into ymm0 (UV). Both shuffle
// operands must be supplied by the enclosing asm statement. Advances
// uyvy_buf by 32.
// NOTE(review): the shuffle tables are defined outside this view; presumably
// they duplicate Y bytes and upsample 8 UV to 16 UV - confirm.
#define READUYVY_AVX2 \
  "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                     \n"        \
    "vpshufb     %[kShuffleUYVYY], %%ymm4, %%ymm4                   \n"        \
    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm0                   \n"        \
    "vpshufb     %[kShuffleUYVYUV], %%ymm0, %%ymm0                  \n"        \
    "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]            \n"
2021 
// YUV to RGB core for the AVX2 row functions.
// Inputs: ymm0 = 16 interleaved UV byte pairs, ymm4 = 16 Y words.
// Outputs: ymm0, ymm1, ymm2 = three colour channels packed to unsigned bytes.
// Seven 32-byte vectors are read from yuvconstants at offsets 0..192:
//   0, 32, 64    - signed byte coefficients multiplied with UV (vpmaddubsw);
//   96, 128, 160 - word values from which those products are subtracted;
//   192          - word multiplier applied to Y (vpmulhuw, high 16 bits).
#if defined(__x86_64__)
// x86_64: preload all seven constant vectors into ymm8-ymm14 once, before the
// loop. vmovdqa requires its memory operand to be 32-byte aligned, so
// yuvconstants must be 32-byte aligned on this path.
#define YUVTORGB_SETUP_AVX2(yuvconstants) \
  "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8              \n"           \
    "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \
    "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \
    "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \
    "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \
    "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \
    "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"

// Per channel: (constant - UV product) + scaled Y with signed saturation,
// arithmetic shift right by 6, then pack to unsigned bytes with saturation.
#define YUVTORGB_AVX2(yuvconstants)                                   \
  "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n" \
  "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n" \
  "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n" \
  "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n" \
  "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n" \
  "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n" \
  "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n" \
  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n" \
  "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
  "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
  "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
  "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
  "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
  "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"

// Extra clobbers for the preloaded constant registers. The trailing comma
// lets the macro be placed directly in front of further clobber entries.
#define YUVTORGB_REGS_AVX2 \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else  // Convert 16 pixels: 16 UV and 16 Y.

// Non-x86_64: nothing is preloaded. The constants are read from memory on
// every iteration, with ymm3 as scratch for the three subtracted-from
// vectors.
#define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB_AVX2(yuvconstants) \
  "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2     \n"        \
    "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1   \n"        \
    "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0        \n"        \
    "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3         \n"        \
    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
    "vmovdqu     " MEMACCESS2(128, [yuvconstants]) ",%%ymm3         \n"        \
    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
    "vmovdqu     " MEMACCESS2(96, [yuvconstants]) ",%%ymm3          \n"        \
    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
    "vpmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4  \n"        \
    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
// No extra registers are used on this path, so the clobber macro is empty.
#define YUVTORGB_REGS_AVX2
#endif
2078 
// Store 16 ARGB values.
// Interleaves the byte channels held in ymm0, ymm1, ymm2 and ymm5 so that
// each pixel is written to memory in the byte order ymm0, ymm1, ymm2, ymm5.
// Writes 64 bytes to [dst_argb] and advances dst_argb by 64.
// Uses ymm1 as scratch, so ymm0-ymm2 no longer hold the channels afterwards;
// ymm5 is only read.
#define STOREARGB_AVX2 \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                  \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n"        \
    "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n"        \
    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n"        \
    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "                    \n"        \
    "vmovdqu    %%ymm0," MEMACCESS2(0x20, [dst_argb]) "             \n"        \
    "lea       " MEMLEA(0x40, [dst_argb]) ", %[dst_argb]            \n"
2090 
2091 #ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
// Converts a row of planar Y, U and V (one U and V sample per pixel) to ARGB,
// 16 pixels per iteration.
//   y_buf, u_buf, v_buf - source planes, advanced by READYUV444_AVX2.
//   dst_argb            - destination row, advanced 64 bytes per iteration.
//   yuvconstants        - conversion constants (see YUVTORGB_AVX2).
//   width               - pixel count; decremented by 16 per iteration.
// The loop body runs before the count is tested and repeats while the
// remaining count is positive, so a width that is not a multiple of 16 still
// processes a full group of 16 pixels.
void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the byte offset from u_buf, so V is read as u_buf+offset.
    "sub       %[u_buf],%[v_buf]               \n"
    // ymm5 = all ones: alpha bytes of 0xff for STOREARGB_AVX2.
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"

    LABELALIGN
    "1:                                        \n"
    READYUV444_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2123 #endif  // HAS_I444TOARGBROW_AVX2
2124 
2125 #if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Converts a row of planar Y, U and V (one U and V sample per two pixels) to
// ARGB, 16 pixels per iteration.
//   y_buf, u_buf, v_buf - source planes, advanced by READYUV422_AVX2.
//   dst_argb            - destination row, advanced 64 bytes per iteration.
//   yuvconstants        - conversion constants (see YUVTORGB_AVX2).
//   width               - pixel count; decremented by 16 per iteration.
// The loop body runs before the count is tested and repeats while the
// remaining count is positive.
void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the byte offset from u_buf, so V is read as u_buf+offset.
    "sub       %[u_buf],%[v_buf]               \n"
    // ymm5 = all ones: alpha bytes of 0xff for STOREARGB_AVX2.
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"

    LABELALIGN
    "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"

    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2158 #endif  // HAS_I422TOARGBROW_AVX2
2159 
2160 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
// Converts a row of planar Y, U, V plus a separate alpha plane to ARGB,
// 16 pixels per iteration. Unlike the other AVX2 converters here, ymm5 is not
// preset to all ones; READYUVA422_AVX2 reloads it from a_buf every iteration.
//   y_buf, u_buf, v_buf, a_buf - source planes, advanced by the read macro.
//   dst_argb                   - destination row, advanced 64 bytes/iteration.
//   yuvconstants               - conversion constants (see YUVTORGB_AVX2).
//   width                      - pixel count; decremented by 16 per iteration.
// The loop body runs before the count is tested and repeats while the
// remaining count is positive.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    const uint8* a_buf,
                                    uint8* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the byte offset from u_buf, so V is read as u_buf+offset.
    "sub       %[u_buf],%[v_buf]               \n"

    LABELALIGN
    "1:                                        \n"
    READYUVA422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    // Explicit 'l' suffix: on i386 width is a memory operand, so the operand
    // size cannot be inferred from a register name.
    "subl      $0x10,%[width]                  \n"
    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    // Kept in memory on i386; presumably because the pointer operands
    // already use up the available general-purpose registers.
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
2199 #endif  // HAS_I422ALPHATOARGBROW_AVX2
2200 
2201 #if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// Converts a row of planar Y, U and V (one U and V sample per two pixels) to
// RGBA, 16 pixels per iteration.
//   y_buf, u_buf, v_buf - source planes, advanced by READYUV422_AVX2.
//   dst_argb            - destination row (RGBA despite the parameter name),
//                         advanced 64 bytes per iteration.
//   yuvconstants        - conversion constants (see YUVTORGB_AVX2).
//   width               - pixel count; decremented by 16 per iteration.
// The loop body runs before the count is tested and repeats while the
// remaining count is positive.
void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the byte offset from u_buf, so V is read as u_buf+offset.
    "sub       %[u_buf],%[v_buf]               \n"
    // ymm5 = all ones: alpha bytes of 0xff.
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
    "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)

    // Step 3: Weave into RGBA
    // Per-pixel byte order written to memory: ymm5 (alpha), ymm0, ymm1, ymm2.
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2243 #endif  // HAS_I422TORGBAROW_AVX2
2244 
2245 #if defined(HAS_NV12TOARGBROW_AVX2)
2246 // 16 pixels.
2247 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
NV12ToARGBRow_AVX2(const uint8 * y_buf,const uint8 * uv_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)2248 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
2249                                const uint8* uv_buf,
2250                                uint8* dst_argb,
2251                                const struct YuvConstants* yuvconstants,
2252                                int width) {
2253   // clang-format off
2254   asm volatile (
2255     YUVTORGB_SETUP_AVX2(yuvconstants)
2256     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2257 
2258     LABELALIGN
2259     "1:                                        \n"
2260     READNV12_AVX2
2261     YUVTORGB_AVX2(yuvconstants)
2262     STOREARGB_AVX2
2263     "sub       $0x10,%[width]                  \n"
2264     "jg        1b                              \n"
2265     "vzeroupper                                \n"
2266   : [y_buf]"+r"(y_buf),    // %[y_buf]
2267     [uv_buf]"+r"(uv_buf),    // %[uv_buf]
2268     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2269     [width]"+rm"(width)    // %[width]
2270   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2271     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
2272     "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2273   );
2274   // clang-format on
2275 }
2276 #endif  // HAS_NV12TOARGBROW_AVX2
2277 
2278 #if defined(HAS_NV21TOARGBROW_AVX2)
2279 // 16 pixels.
2280 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
NV21ToARGBRow_AVX2(const uint8 * y_buf,const uint8 * vu_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)2281 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
2282                                const uint8* vu_buf,
2283                                uint8* dst_argb,
2284                                const struct YuvConstants* yuvconstants,
2285                                int width) {
2286   // clang-format off
2287   asm volatile (
2288     YUVTORGB_SETUP_AVX2(yuvconstants)
2289     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2290 
2291     LABELALIGN
2292     "1:                                        \n"
2293     READNV21_AVX2
2294     YUVTORGB_AVX2(yuvconstants)
2295     STOREARGB_AVX2
2296     "sub       $0x10,%[width]                  \n"
2297     "jg        1b                              \n"
2298     "vzeroupper                                \n"
2299   : [y_buf]"+r"(y_buf),    // %[y_buf]
2300     [vu_buf]"+r"(vu_buf),    // %[vu_buf]
2301     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2302     [width]"+rm"(width)    // %[width]
2303   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2304     [kShuffleNV21]"m"(kShuffleNV21)
2305     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
2306       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2307   );
2308   // clang-format on
2309 }
2310 #endif  // HAS_NV21TOARGBROW_AVX2
2311 
2312 #if defined(HAS_YUY2TOARGBROW_AVX2)
2313 // 16 pixels.
2314 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
YUY2ToARGBRow_AVX2(const uint8 * yuy2_buf,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)2315 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
2316                                uint8* dst_argb,
2317                                const struct YuvConstants* yuvconstants,
2318                                int width) {
2319   // clang-format off
2320   asm volatile (
2321     YUVTORGB_SETUP_AVX2(yuvconstants)
2322     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2323 
2324     LABELALIGN
2325     "1:                                        \n"
2326     READYUY2_AVX2
2327     YUVTORGB_AVX2(yuvconstants)
2328     STOREARGB_AVX2
2329     "sub       $0x10,%[width]                  \n"
2330     "jg        1b                              \n"
2331     "vzeroupper                                \n"
2332   : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
2333     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2334     [width]"+rm"(width)    // %[width]
2335   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2336     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2337     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2338     : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
2339       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2340   );
2341   // clang-format on
2342 }
2343 #endif  // HAS_YUY2TOARGBROW_AVX2
2344 
2345 #if defined(HAS_UYVYTOARGBROW_AVX2)
// 16 pixels.
// 32 bytes of packed UYVY (16 Y and 8 UV pairs) producing 16 ARGB (64 bytes).
// Converts a row of packed UYVY to ARGB, 16 pixels per iteration.
//   uyvy_buf     - source row, advanced 32 bytes per iteration.
//   dst_argb     - destination row, advanced 64 bytes per iteration.
//   yuvconstants - conversion constants (see YUVTORGB_AVX2).
//   width        - pixel count; decremented by 16 per iteration.
// The loop body runs before the count is tested and repeats while the
// remaining count is positive, so a width that is not a multiple of 16 still
// processes a full group of 16 pixels.
void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // ymm5 = all ones: alpha bytes of 0xff for STOREARGB_AVX2.
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
    "1:                                        \n"
    READUYVY_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper                                \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
2376 #endif  // HAS_UYVYTOARGBROW_AVX2
2377 
2378 #ifdef HAS_I400TOARGBROW_SSE2
// Converts a row of 8-bit luma (I400) to grey ARGB using SSE2, 8 pixels per
// iteration. Each output pixel carries the same scaled value in its three
// colour bytes and 0xff in its top (alpha) byte.
//   y_buf    - source luma row, advanced 8 bytes per iteration.
//   dst_argb - destination row, advanced 32 bytes per iteration.
//   width    - pixel count; decremented by 8 per iteration.
// The loop body runs before the count is tested and repeats while the
// remaining count is positive, so a width that is not a multiple of 8 still
// processes a full group of 8 pixels. Uses eax as scratch for the constants.
void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
    "movd      %%eax,%%xmm2                    \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
    "movd      %%eax,%%xmm3                    \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    // xmm4 = 0xff000000 in every dword: the alpha mask ORed in below.
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"

    LABELALIGN
    "1:                                        \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    // Duplicate each Y byte into both halves of a word before the multiply.
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // Unsigned saturating subtract: results below zero clamp to 0.
    "psubusw   %%xmm3,%%xmm0                   \n"
    "psrlw     $6, %%xmm0                      \n"
    "packuswb  %%xmm0,%%xmm0                   \n"

    // Step 2: Weave into ARGB
    // Replicate each grey byte four times, then force the alpha byte to 0xff.
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm1                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "por       %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"

    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(y_buf),     // %0
    "+r"(dst_argb),  // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc", "eax"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
2422 #endif  // HAS_I400TOARGBROW_SSE2
2423 
2424 #ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
// Converts a row of 8-bit luma (I400) to grey ARGB using AVX2, 16 pixels per
// iteration. Each output pixel carries the same scaled value in its three
// colour bytes and 0xff in its top (alpha) byte.
//   y_buf    - source luma row, advanced 16 bytes per iteration.
//   dst_argb - destination row, advanced 64 bytes per iteration.
//   width    - pixel count; decremented by 16 per iteration.
// The loop body runs before the count is tested and repeats while the
// remaining count is positive, so a width that is not a multiple of 16 still
// processes a full group of 16 pixels. Uses eax as scratch for the constants.
void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    "mov        $0x4a354a35,%%eax              \n" // 4a35 = 18997 = 1.164
    "vmovd      %%eax,%%xmm2                   \n"
    "vbroadcastss %%xmm2,%%ymm2                \n"
    "mov        $0x4880488,%%eax               \n" // 0488 = 1160 = 1.164 * 16
    "vmovd      %%eax,%%xmm3                   \n"
    "vbroadcastss %%xmm3,%%ymm3                \n"
    // ymm4 = 0xff000000 in every dword: the alpha mask ORed in below.
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpslld     $0x18,%%ymm4,%%ymm4            \n"

    LABELALIGN
    "1:                                        \n"
    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
    "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
    "lea        " MEMLEA(0x10,0) ",%0          \n"
    // Reorder quadwords to 0,2,1,3 so the per-lane unpack covers all 16 bytes.
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    // Unsigned saturating subtract: results below zero clamp to 0.
    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    // Step 2: Weave into ARGB. Replicate each grey byte four times, then
    // force the alpha byte to 0xff.
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub        $0x10,%2                       \n"
    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper                                \n"
  : "+r"(y_buf),     // %0
    "+r"(dst_argb),  // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc", "eax"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
2469 #endif  // HAS_I400TOARGBROW_AVX2
2470 
2471 #ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
// Byte i of the result comes from byte 15 - i of the source.
static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                               7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

// Copies a row of bytes from src to dst in reverse order using SSSE3,
// 16 bytes per iteration.
//   src   - source row; read from its end towards its start.
//   dst   - destination row; written front to back, 16 bytes per iteration.
//   width - byte count; widened to intptr_t so it can be used as an index
//           register in the source address.
// The loop body runs before the count is tested and repeats while the
// remaining count is positive. With a width that is not a multiple of 16 the
// final load starts before src.
// NOTE(review): presumably callers only pass multiples of 16 - confirm.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // movdqa: aligned load of the shuffle table into xmm5.
    "movdqa    %3,%%xmm5                       \n"

    LABELALIGN
    "1:                                        \n"
    // Load the last 16 not-yet-mirrored source bytes.
    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
    "pshufb    %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    // The remaining count also serves as the source offset.
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
2497 #endif  // HAS_MIRRORROW_SSSE3
2498 
2499 #ifdef HAS_MIRRORROW_AVX2
// Copies a row of bytes from src to dst in reverse order using AVX2,
// 32 bytes per iteration.
//   src   - source row; read from its end towards its start.
//   dst   - destination row; written front to back, 32 bytes per iteration.
//   width - byte count; widened to intptr_t so it can be used as an index
//           register in the source address.
// The loop body runs before the count is tested and repeats while the
// remaining count is positive. With a width that is not a multiple of 32 the
// final load starts before src.
// NOTE(review): presumably callers only pass multiples of 32 - confirm.
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // Replicate the 16-byte reverse table into both 128-bit lanes of ymm5.
    "vbroadcastf128 %3,%%ymm5                  \n"

    LABELALIGN
    "1:                                        \n"
    // Load the last 32 not-yet-mirrored source bytes.
    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
    // vpshufb reverses bytes within each 128-bit lane only...
    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
    // ...so swap the two lanes to complete the 32-byte reversal.
    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    // The remaining count also serves as the source offset.
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper                                \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
2523 #endif  // HAS_MIRRORROW_AVX2
2524 
2525 #ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
// Low 8 result bytes: the even source bytes in reverse order.
// High 8 result bytes: the odd source bytes in reverse order.
static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
// Mirrors a row of interleaved UV and splits it into separate U and V rows
// using SSSE3, 8 UV pairs (16 source bytes) per iteration.
//   src   - interleaved source row; read from its end towards its start.
//   dst_u - receives the even source bytes, reversed, 8 per iteration.
//   dst_v - receives the odd source bytes, reversed, 8 per iteration.
//   width - number of UV pairs; decremented by 8 per iteration.
// The loop body runs before the count is tested and repeats while the
// remaining count is positive.
// NOTE(review): presumably callers only pass multiples of 8 - confirm.
void MirrorUVRow_SSSE3(const uint8* src,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %4,%%xmm1                       \n"
    // Point src at the last 16 bytes of the row: src + width * 2 - 16.
    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
    // dst_v becomes the byte offset from dst_u, so only dst_u is advanced.
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    "pshufb    %%xmm1,%%xmm0                   \n"
    // Low 8 bytes go to dst_u, high 8 bytes to dst_u + offset (the V row).
    "movlpd    %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $8,%3                           \n"
    "jg        1b                              \n"
  : "+r"(src),      // %0
    "+r"(dst_u),    // %1
    "+r"(dst_v),    // %2
    "+r"(temp_width)  // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
2558 #endif  // HAS_MIRRORUVROW_SSSE3
2559 
2560 #ifdef HAS_ARGBMIRRORROW_SSE2
2561 
// Reverses the order of the 4-byte pixels of a row, 4 pixels (16 bytes) per
// iteration. src is first moved to what is presumably the last 16 bytes of
// the row (base + width * 4 - 16, per the MEMLEA4 arguments) and then walks
// backwards while dst walks forwards. The bytes inside each pixel keep their
// order. No partial-tail handling: every pass moves a full 16 bytes.
void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t pixels_left = (intptr_t)(width);  // pointer-sized: used as an index
  asm volatile(
      "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"  // src -> tail of the row

      LABELALIGN
      "1:                                        \n"
      "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
      "pshufd    $0x1b,%%xmm0,%%xmm0             \n"  // reverse the four dwords
      "lea       " MEMLEA(-0x10,0) ",%0          \n"  // src -= 16
      "movdqu    %%xmm0," MEMACCESS(1) "         \n"
      "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst += 16
      "sub       $0x4,%2                         \n"  // 4 pixels consumed
      "jg        1b                              \n"
      : "+r"(src),          // %0
        "+r"(dst),          // %1
        "+r"(pixels_left)   // %2
      :
      : "memory", "cc", "xmm0");
}
2584 #endif  // HAS_ARGBMIRRORROW_SSE2
2585 
2586 #ifdef HAS_ARGBMIRRORROW_AVX2
2587 // Shuffle table for reversing the bytes.
2588 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
ARGBMirrorRow_AVX2(const uint8 * src,uint8 * dst,int width)2589 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2590   intptr_t temp_width = (intptr_t)(width);
2591   asm volatile (
2592     "vmovdqu    %3,%%ymm5                      \n"
2593 
2594     LABELALIGN
2595     "1:                                        \n"
2596     VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
2597     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2598     "lea        " MEMLEA(0x20,1) ",%1          \n"
2599     "sub        $0x8,%2                        \n"
2600     "jg         1b                             \n"
2601     "vzeroupper                                \n"
2602   : "+r"(src),  // %0
2603     "+r"(dst),  // %1
2604     "+r"(temp_width)  // %2
2605   : "m"(kARGBShuffleMirror_AVX2) // %3
2606   : "memory", "cc", NACL_R14
2607     "xmm0", "xmm5"
2608   );
2609 }
2610 #endif  // HAS_ARGBMIRRORROW_AVX2
2611 
2612 #ifdef HAS_SPLITUVROW_AVX2
// De-interleaves a row of byte pairs: the even bytes of src_uv are written to
// dst_u and the odd bytes to dst_v. Each iteration reads 64 source bytes and
// writes 32 bytes to each destination; width is decremented by 32 per pass.
// No partial-tail handling: every pass moves full vectors.
void SplitUVRow_AVX2(const uint8* src_uv,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  asm volatile(
      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // all ones
      "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"  // 0x00ff in every word
      "sub        %1,%2                          \n"  // dst_v -> offset from dst_u

      LABELALIGN
      "1:                                        \n"
      "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
      "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
      "lea        " MEMLEA(0x40,0) ",%0          \n"  // src_uv += 64
      "vpsrlw     $0x8,%%ymm0,%%ymm2             \n"  // odd bytes
      "vpsrlw     $0x8,%%ymm1,%%ymm3             \n"
      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"  // even bytes
      "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"  // per-lane pack
      "vpackuswb  %%ymm3,%%ymm2,%%ymm2           \n"
      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // restore linear order
      "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
      "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
      MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)           //  vmovdqu %%ymm2,(%1,%2)
      "lea        " MEMLEA(0x20,1) ",%1          \n"  // dst_u += 32
      "sub        $0x20,%3                       \n"
      "jg         1b                             \n"
      "vzeroupper                                \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
      :
      : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
2650 #endif  // HAS_SPLITUVROW_AVX2
2651 
2652 #ifdef HAS_SPLITUVROW_SSE2
// De-interleaves a row of byte pairs: the even bytes of src_uv are written to
// dst_u and the odd bytes to dst_v. Each iteration reads 32 source bytes and
// writes 16 bytes to each destination; width is decremented by 16 per pass.
// No partial-tail handling: every pass moves full vectors.
void SplitUVRow_SSE2(const uint8* src_uv,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  asm volatile(
      "pcmpeqb    %%xmm5,%%xmm5                  \n"  // all ones
      "psrlw      $0x8,%%xmm5                    \n"  // 0x00ff in every word
      "sub        %1,%2                          \n"  // dst_v -> offset from dst_u

      LABELALIGN
      "1:                                        \n"
      "movdqu     " MEMACCESS(0) ",%%xmm0        \n"
      "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1  \n"
      "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_uv += 32
      "movdqa     %%xmm0,%%xmm2                  \n"  // keep copies for odd bytes
      "movdqa     %%xmm1,%%xmm3                  \n"
      "pand       %%xmm5,%%xmm0                  \n"  // even bytes
      "pand       %%xmm5,%%xmm1                  \n"
      "packuswb   %%xmm1,%%xmm0                  \n"
      "psrlw      $0x8,%%xmm2                    \n"  // odd bytes
      "psrlw      $0x8,%%xmm3                    \n"
      "packuswb   %%xmm3,%%xmm2                  \n"
      "movdqu     %%xmm0," MEMACCESS(1) "        \n"
      MEMOPMEM(movdqu,xmm2,0x00,1,2,1)           //  movdqu     %%xmm2,(%1,%2)
      "lea        " MEMLEA(0x10,1) ",%1          \n"  // dst_u += 16
      "sub        $0x10,%3                       \n"
      "jg         1b                             \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
      :
      : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
2689 #endif  // HAS_SPLITUVROW_SSE2
2690 
2691 #ifdef HAS_MERGEUVROW_AVX2
// Interleaves two byte rows into one: dst_uv receives src_u[0], src_v[0],
// src_u[1], src_v[1], ... Each iteration reads 32 bytes from each source and
// writes 64 bytes; width is decremented by 32 per pass. No partial-tail
// handling: every pass moves full vectors.
void MergeUVRow_AVX2(const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_uv,
                     int width) {
  asm volatile(
      "sub       %0,%1                           \n"  // src_v -> offset from src_u

      LABELALIGN
      "1:                                        \n"
      "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
      MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)           //  vmovdqu (%0,%1,1),%%ymm1
      "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_u += 32
      "vpunpcklbw %%ymm1,%%ymm0,%%ymm2           \n"  // per-lane low interleave
      "vpunpckhbw %%ymm1,%%ymm0,%%ymm0           \n"  // per-lane high interleave
      // The unpacks work per 128-bit lane, so the four 16-byte pieces are
      // stored individually in the order that yields a linear output.
      "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
      "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
      "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
      "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
      "lea       " MEMLEA(0x40,2) ",%2           \n"  // dst_uv += 64
      "sub       $0x20,%3                        \n"
      "jg        1b                              \n"
      "vzeroupper                                \n"
      : "+r"(src_u),   // %0
        "+r"(src_v),   // %1
        "+r"(dst_uv),  // %2
        "+r"(width)    // %3
      :
      : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2");
}
2723 #endif  // HAS_MERGEUVROW_AVX2
2724 
2725 #ifdef HAS_MERGEUVROW_SSE2
// Interleaves two byte rows into one: dst_uv receives src_u[0], src_v[0],
// src_u[1], src_v[1], ... Each iteration reads 16 bytes from each source and
// writes 32 bytes; width is decremented by 16 per pass. No partial-tail
// handling: every pass moves full vectors.
void MergeUVRow_SSE2(const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_uv,
                     int width) {
  asm volatile(
      "sub       %0,%1                           \n"  // src_v -> offset from src_u

      LABELALIGN
      "1:                                        \n"
      "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
      MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
      "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_u += 16
      "movdqa    %%xmm0,%%xmm2                   \n"  // copy for the high half
      "punpcklbw %%xmm1,%%xmm0                   \n"  // first 8 pairs
      "punpckhbw %%xmm1,%%xmm2                   \n"  // last 8 pairs
      "movdqu    %%xmm0," MEMACCESS(2) "         \n"
      "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
      "lea       " MEMLEA(0x20,2) ",%2           \n"  // dst_uv += 32
      "sub       $0x10,%3                        \n"
      "jg        1b                              \n"
      : "+r"(src_u),   // %0
        "+r"(src_v),   // %1
        "+r"(dst_uv),  // %2
        "+r"(width)    // %3
      :
      : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2");
}
2755 #endif  // HAS_MERGEUVROW_SSE2
2756 
2757 #ifdef HAS_COPYROW_SSE2
// Copies count bytes from src to dst, 32 bytes per iteration.
// If both pointers have their low four bits clear (16-byte aligned) the
// aligned-move loop at label 1 is used; otherwise the unaligned-move loop at
// label 2 is used. Neither loop handles a partial tail: every pass moves a
// full 32 bytes.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile(
      "test       $0xf,%0                        \n"  // src 16-byte aligned?
      "jne        2f                             \n"
      "test       $0xf,%1                        \n"  // dst 16-byte aligned?
      "jne        2f                             \n"

      // Aligned loop.
      LABELALIGN
      "1:                                        \n"
      "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
      "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
      "lea       " MEMLEA(0x20,0) ",%0           \n"
      "movdqa    %%xmm0," MEMACCESS(1) "         \n"
      "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
      "lea       " MEMLEA(0x20,1) ",%1           \n"
      "sub       $0x20,%2                        \n"
      "jg        1b                              \n"
      "jmp       9f                              \n"  // skip the unaligned loop

      // Unaligned loop.
      LABELALIGN
      "2:                                          \n"
      "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
      "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
      "lea       " MEMLEA(0x20,0) ",%0           \n"
      "movdqu    %%xmm0," MEMACCESS(1) "         \n"
      "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
      "lea       " MEMLEA(0x20,1) ",%1           \n"
      "sub       $0x20,%2                        \n"
      "jg        2b                              \n"
      "9:                                          \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(count)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
2796 #endif  // HAS_COPYROW_SSE2
2797 
2798 #ifdef HAS_COPYROW_AVX
// Copies count bytes from src to dst using unaligned 256-bit moves, 64 bytes
// per iteration. There is no partial-tail handling: every pass moves a full
// 64 bytes, so count is presumably expected to be a positive multiple of 64.
//
// vzeroupper is issued before returning, as in the other AVX routines in this
// file, so the dirty upper ymm halves do not slow down SSE code that runs
// afterwards.
void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src += 64
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"  // dst += 64
    "sub       $0x40,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"  // clear upper ymm state
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
2819 #endif  // HAS_COPYROW_AVX
2820 
2821 #ifdef HAS_COPYROW_ERMS
2822 // Multiple of 1.
// Copies width bytes from src to dst with a single "rep movsb".
// The operands are pinned to the registers the string instruction uses:
// source in SI ("S"), destination in DI ("D") and the byte count in CX ("c").
// Any byte count is accepted; there is no vector granularity to respect.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  size_t bytes_left = (size_t)(width);  // count register is pointer-sized
  asm volatile(
      "rep movsb " MEMMOVESTRING(0, 1) "          \n"
      : "+S"(src),        // %0
        "+D"(dst),        // %1
        "+c"(bytes_left)  // %2
      :
      : "memory", "cc");
}
2832 #endif  // HAS_COPYROW_ERMS
2833 
2834 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
2835 // width in pixels
// Copies byte 3 of every 4-byte pixel (the alpha channel, going by the
// function name) from src into dst and leaves bytes 0..2 of each dst pixel
// unchanged. Handles 8 pixels (32 bytes) per iteration; width is in pixels.
// No partial-tail handling: every pass reads and writes full vectors.
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile(
      "pcmpeqb   %%xmm0,%%xmm0                   \n"
      "pslld     $0x18,%%xmm0                    \n"  // 0xff000000: top byte mask
      "pcmpeqb   %%xmm1,%%xmm1                   \n"
      "psrld     $0x8,%%xmm1                     \n"  // 0x00ffffff: low 3 bytes

      LABELALIGN
      "1:                                        \n"
      "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // source pixels
      "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
      "lea       " MEMLEA(0x20,0) ",%0           \n"
      "movdqu    " MEMACCESS(1) ",%%xmm4         \n"  // current dst pixels
      "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
      "pand      %%xmm0,%%xmm2                   \n"  // keep top byte of src
      "pand      %%xmm0,%%xmm3                   \n"
      "pand      %%xmm1,%%xmm4                   \n"  // keep low 3 bytes of dst
      "pand      %%xmm1,%%xmm5                   \n"
      "por       %%xmm4,%%xmm2                   \n"  // merge
      "por       %%xmm5,%%xmm3                   \n"
      "movdqu    %%xmm2," MEMACCESS(1) "         \n"
      "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
      "lea       " MEMLEA(0x20,1) ",%1           \n"
      "sub       $0x8,%2                         \n"  // 8 pixels consumed
      "jg        1b                              \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
2869 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
2870 
2871 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
2872 // width in pixels
// Copies byte 3 of every 4-byte pixel (the alpha channel, going by the
// function name) from src into dst and leaves bytes 0..2 of each dst pixel
// unchanged. Handles 16 pixels (64 bytes) per iteration; width is in pixels.
// No partial-tail handling: every pass reads and writes full vectors.
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile(
      "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
      "vpsrld    $0x8,%%ymm0,%%ymm0              \n"  // 0x00ffffff blend mask

      LABELALIGN
      "1:                                        \n"
      "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"  // source pixels
      "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
      "lea       " MEMLEA(0x40,0) ",%0           \n"
      // Where the mask byte is set take the dst byte from memory, otherwise
      // keep the src byte: low 3 bytes from dst, top byte from src.
      "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
      "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
      "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
      "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
      "lea       " MEMLEA(0x40,1) ",%1           \n"
      "sub       $0x10,%2                        \n"  // 16 pixels consumed
      "jg        1b                              \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
2899 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
2900 
2901 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
2902 // width in pixels
// Extracts byte 3 of every 4-byte pixel of src_argb (the alpha channel, going
// by the function name) into a tightly packed byte row at dst_a.
// Handles 8 pixels per iteration: 32 bytes read, 8 bytes written; width is in
// pixels. No partial-tail handling: every pass moves full vectors.
void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
  asm volatile(
      LABELALIGN
      "1:                                        \n"
      "movdqu    " MEMACCESS(0) ", %%xmm0        \n"
      "movdqu    " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
      "lea       " MEMLEA(0x20, 0) ", %0         \n"  // src_argb += 32
      "psrld     $0x18, %%xmm0                   \n"  // top byte -> low byte
      "psrld     $0x18, %%xmm1                   \n"
      "packssdw  %%xmm1, %%xmm0                  \n"  // 8 dwords -> 8 words
      "packuswb  %%xmm0, %%xmm0                  \n"  // 8 words -> 8 bytes
      "movq      %%xmm0," MEMACCESS(1) "         \n"  // store the low 8 bytes
      "lea       " MEMLEA(0x8, 1) ", %1          \n"  // dst_a += 8
      "sub       $0x8, %2                        \n"
      "jg        1b                              \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+rm"(width)     // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
2926 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
2927 
2928 #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
2929 static const uvec8 kShuffleAlphaShort_AVX2 = {
2930     3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
2931     11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
2932 
ARGBExtractAlphaRow_AVX2(const uint8 * src_argb,uint8 * dst_a,int width)2933 void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
2934   asm volatile (
2935     "vmovdqa    %3,%%ymm4                      \n"
2936     "vbroadcastf128 %4,%%ymm5                  \n"
2937 
2938     LABELALIGN
2939     "1:                                        \n"
2940     "vmovdqu   " MEMACCESS(0) ", %%ymm0        \n"
2941     "vmovdqu   " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
2942     "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n" // vpsrld $0x18, %%ymm0
2943     "vpshufb    %%ymm5,%%ymm1,%%ymm1           \n"
2944     "vmovdqu   " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
2945     "vmovdqu   " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
2946     "lea       " MEMLEA(0x80, 0) ", %0         \n"
2947     "vpackssdw  %%ymm1, %%ymm0, %%ymm0         \n"  // mutates
2948     "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
2949     "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
2950     "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // mutates
2951     "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
2952     "vpermd     %%ymm0,%%ymm4,%%ymm0           \n"  // unmutate.
2953     "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2954     "lea       " MEMLEA(0x20,1) ",%1           \n"
2955     "sub        $0x20, %2                      \n"
2956     "jg         1b                             \n"
2957     "vzeroupper                                \n"
2958   : "+r"(src_argb),  // %0
2959     "+r"(dst_a),     // %1
2960     "+rm"(width)     // %2
2961   : "m"(kPermdARGBToY_AVX),  // %3
2962     "m"(kShuffleAlphaShort_AVX2)  // %4
2963   : "memory", "cc"
2964     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2965   );
2966 }
2967 #endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
2968 
2969 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
2970 // width in pixels
// Writes each byte of src into byte 3 of the corresponding 4-byte pixel of
// dst and leaves bytes 0..2 of each dst pixel unchanged. Going by the function
// name, src is a Y plane and byte 3 is the alpha channel.
// Handles 8 pixels per iteration: 8 bytes read from src, 32 bytes of dst
// updated; width is in pixels. No partial-tail handling.
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile(
      "pcmpeqb   %%xmm0,%%xmm0                   \n"
      "pslld     $0x18,%%xmm0                    \n"  // 0xff000000: top byte mask
      "pcmpeqb   %%xmm1,%%xmm1                   \n"
      "psrld     $0x8,%%xmm1                     \n"  // 0x00ffffff: low 3 bytes

      LABELALIGN
      "1:                                        \n"
      "movq      " MEMACCESS(0) ",%%xmm2         \n"  // 8 source bytes
      "lea       " MEMLEA(0x8,0) ",%0            \n"
      "punpcklbw %%xmm2,%%xmm2                   \n"  // each byte doubled to a word
      // The low word of each xmm3 dword is whatever xmm3 held before; only
      // the top byte survives the mask below, so that is harmless.
      "punpckhwd %%xmm2,%%xmm3                   \n"  // pixels 4..7 in high words
      "punpcklwd %%xmm2,%%xmm2                   \n"  // pixels 0..3
      "movdqu    " MEMACCESS(1) ",%%xmm4         \n"  // current dst pixels
      "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
      "pand      %%xmm0,%%xmm2                   \n"  // keep new top byte
      "pand      %%xmm0,%%xmm3                   \n"
      "pand      %%xmm1,%%xmm4                   \n"  // keep low 3 bytes of dst
      "pand      %%xmm1,%%xmm5                   \n"
      "por       %%xmm4,%%xmm2                   \n"  // merge
      "por       %%xmm5,%%xmm3                   \n"
      "movdqu    %%xmm2," MEMACCESS(1) "         \n"
      "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
      "lea       " MEMLEA(0x20,1) ",%1           \n"
      "sub       $0x8,%2                         \n"  // 8 pixels consumed
      "jg        1b                              \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
3006 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3007 
3008 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3009 // width in pixels
// Writes each byte of src into byte 3 of the corresponding 4-byte pixel of
// dst and leaves bytes 0..2 of each dst pixel unchanged. Going by the function
// name, src is a Y plane and byte 3 is the alpha channel.
// Handles 16 pixels per iteration: 16 bytes read from src, 64 bytes of dst
// updated; width is in pixels. No partial-tail handling.
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile(
      "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
      "vpsrld    $0x8,%%ymm0,%%ymm0              \n"  // 0x00ffffff blend mask

      LABELALIGN
      "1:                                        \n"
      "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"  // 8 bytes -> 8 dwords
      "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
      "lea       " MEMLEA(0x10,0) ",%0           \n"  // src += 16
      "vpslld    $0x18,%%ymm1,%%ymm1             \n"  // move value to top byte
      "vpslld    $0x18,%%ymm2,%%ymm2             \n"
      // Where the mask byte is set take the dst byte from memory, otherwise
      // keep the new byte: low 3 bytes from dst, top byte from src.
      "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
      "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
      "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
      "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
      "lea       " MEMLEA(0x40,1) ",%1           \n"  // dst += 64
      "sub       $0x10,%2                        \n"  // 16 pixels consumed
      "jg        1b                              \n"
      "vzeroupper                                \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
3038 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3039 
3040 #ifdef HAS_SETROW_X86
// Fills dst with the byte v8 using "rep stosl", i.e. four bytes per store.
// The store count is width >> 2, so only the largest multiple of 4 bytes not
// exceeding width is written; any remaining 1..3 bytes are left untouched.
void SetRow_X86(uint8* dst, uint8 v8, int width) {
  const uint32 v32 = v8 * 0x01010101u;  // Replicate v8 into all four bytes.
  size_t dword_count = (size_t)(width >> 2);
  asm volatile(
      "rep stosl " MEMSTORESTRING(eax, 0) "       \n"
      : "+D"(dst),         // %0
        "+c"(dword_count)  // %1
      : "a"(v32)           // %2
      : "memory", "cc");
}
3050 
// Fills width bytes at dst with the byte v8 using a single "rep stosb".
// Destination is pinned to DI ("D"), the count to CX ("c") and the fill value
// to the A register ("a"), as the string instruction requires.
void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
  size_t byte_count = (size_t)(width);  // count register is pointer-sized
  asm volatile(
      "rep stosb " MEMSTORESTRING(al, 0) "        \n"
      : "+D"(dst),        // %0
        "+c"(byte_count)  // %1
      : "a"(v8)           // %2
      : "memory", "cc");
}
3059 
// Fills dst_argb with width copies of the 32-bit value v32 using "rep stosl"
// (one 4-byte store per unit of width).
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
  size_t dword_count = (size_t)(width);  // count register is pointer-sized
  asm volatile(
      "rep stosl " MEMSTORESTRING(eax, 0) "       \n"
      : "+D"(dst_argb),    // %0
        "+c"(dword_count)  // %1
      : "a"(v32)           // %2
      : "memory", "cc");
}
3068 #endif  // HAS_SETROW_X86
3069 
3070 #ifdef HAS_YUY2TOYROW_SSE2
// Extracts the even bytes of src_yuy2 into dst_y (the Y samples, going by the
// function and parameter names).
// Each iteration reads 32 source bytes and writes 16; width is decremented by
// 16 per pass. No partial-tail handling: every pass moves full vectors.
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
  asm volatile(
      "pcmpeqb   %%xmm5,%%xmm5                   \n"
      "psrlw     $0x8,%%xmm5                     \n"  // 0x00ff in every word

      LABELALIGN
      "1:                                        \n"
      "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
      "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
      "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_yuy2 += 32
      "pand      %%xmm5,%%xmm0                   \n"  // keep even bytes
      "pand      %%xmm5,%%xmm1                   \n"
      "packuswb  %%xmm1,%%xmm0                   \n"  // 16 words -> 16 bytes
      "movdqu    %%xmm0," MEMACCESS(1) "         \n"
      "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16
      "sub       $0x10,%2                        \n"
      "jg        1b                              \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
3096 
// Produces one row of U and one row of V from two rows of packed source data:
// the row at src_yuy2 and the row stride_yuy2 bytes after it are averaged
// byte-wise (pavgb), then the odd bytes of the result are split alternately
// into dst_u and dst_v.
// Each iteration reads 32 bytes from each of the two rows and writes 8 bytes
// to each destination; width is decremented by 16 per pass. No partial-tail
// handling: every pass moves full vectors.
void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
                      int stride_yuy2,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile(
      "pcmpeqb   %%xmm5,%%xmm5                   \n"
      "psrlw     $0x8,%%xmm5                     \n"  // 0x00ff in every word
      "sub       %1,%2                           \n"  // dst_v -> offset from dst_u

      LABELALIGN
      "1:                                        \n"
      "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
      "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
      MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
      MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
      "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_yuy2 += 32
      "pavgb     %%xmm2,%%xmm0                   \n"  // average the two rows
      "pavgb     %%xmm3,%%xmm1                   \n"
      "psrlw     $0x8,%%xmm0                     \n"  // keep odd bytes
      "psrlw     $0x8,%%xmm1                     \n"
      "packuswb  %%xmm1,%%xmm0                   \n"  // 16 interleaved bytes
      "movdqa    %%xmm0,%%xmm1                   \n"
      "pand      %%xmm5,%%xmm0                   \n"  // even of those -> dst_u
      "packuswb  %%xmm0,%%xmm0                   \n"
      "psrlw     $0x8,%%xmm1                     \n"  // odd of those -> dst_v
      "packuswb  %%xmm1,%%xmm1                   \n"
      "movq      %%xmm0," MEMACCESS(1) "         \n"
      MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
      "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8
      "sub       $0x10,%3                        \n"
      "jg        1b                              \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
3138 
// Produces one row of U and one row of V from a single row of packed source
// data: the odd bytes of src_yuy2 are split alternately into dst_u and dst_v.
// Unlike YUY2ToUVRow_SSE2 no second row is read or averaged.
// Each iteration reads 32 source bytes and writes 8 bytes to each
// destination; width is decremented by 16 per pass. No partial-tail handling.
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm volatile(
      "pcmpeqb   %%xmm5,%%xmm5                   \n"
      "psrlw     $0x8,%%xmm5                     \n"  // 0x00ff in every word
      "sub       %1,%2                           \n"  // dst_v -> offset from dst_u

      LABELALIGN
      "1:                                        \n"
      "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
      "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
      "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_yuy2 += 32
      "psrlw     $0x8,%%xmm0                     \n"  // keep odd bytes
      "psrlw     $0x8,%%xmm1                     \n"
      "packuswb  %%xmm1,%%xmm0                   \n"  // 16 interleaved bytes
      "movdqa    %%xmm0,%%xmm1                   \n"
      "pand      %%xmm5,%%xmm0                   \n"  // even of those -> dst_u
      "packuswb  %%xmm0,%%xmm0                   \n"
      "psrlw     $0x8,%%xmm1                     \n"  // odd of those -> dst_v
      "packuswb  %%xmm1,%%xmm1                   \n"
      "movq      %%xmm0," MEMACCESS(1) "         \n"
      MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
      "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8
      "sub       $0x10,%3                        \n"
      "jg        1b                              \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm5");
}
3175 
// Extracts the odd bytes of src_uyvy into dst_y (the Y samples, going by the
// function and parameter names).
// Each iteration reads 32 source bytes and writes 16; width is decremented by
// 16 per pass. No partial-tail handling: every pass moves full vectors.
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
  asm volatile(
      LABELALIGN
      "1:                                        \n"
      "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
      "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
      "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_uyvy += 32
      "psrlw     $0x8,%%xmm0                     \n"  // keep odd bytes
      "psrlw     $0x8,%%xmm1                     \n"
      "packuswb  %%xmm1,%%xmm0                   \n"  // 16 words -> 16 bytes
      "movdqu    %%xmm0," MEMACCESS(1) "         \n"
      "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16
      "sub       $0x10,%2                        \n"
      "jg        1b                              \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
3198 
// Produces one row of U and one row of V from two rows of packed source data:
// the row at src_uyvy and the row stride_uyvy bytes after it are averaged
// byte-wise (pavgb), then the even bytes of the result are split alternately
// into dst_u and dst_v.
// Each iteration reads 32 bytes from each of the two rows and writes 8 bytes
// to each destination; width is decremented by 16 per pass. No partial-tail
// handling: every pass moves full vectors.
void UYVYToUVRow_SSE2(const uint8* src_uyvy,
                      int stride_uyvy,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile(
      "pcmpeqb   %%xmm5,%%xmm5                   \n"
      "psrlw     $0x8,%%xmm5                     \n"  // 0x00ff in every word
      "sub       %1,%2                           \n"  // dst_v -> offset from dst_u

      LABELALIGN
      "1:                                        \n"
      "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
      "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
      MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
      MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
      "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_uyvy += 32
      "pavgb     %%xmm2,%%xmm0                   \n"  // average the two rows
      "pavgb     %%xmm3,%%xmm1                   \n"
      "pand      %%xmm5,%%xmm0                   \n"  // keep even bytes
      "pand      %%xmm5,%%xmm1                   \n"
      "packuswb  %%xmm1,%%xmm0                   \n"  // 16 interleaved bytes
      "movdqa    %%xmm0,%%xmm1                   \n"
      "pand      %%xmm5,%%xmm0                   \n"  // even of those -> dst_u
      "packuswb  %%xmm0,%%xmm0                   \n"
      "psrlw     $0x8,%%xmm1                     \n"  // odd of those -> dst_v
      "packuswb  %%xmm1,%%xmm1                   \n"
      "movq      %%xmm0," MEMACCESS(1) "         \n"
      MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
      "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8
      "sub       $0x10,%3                        \n"
      "jg        1b                              \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
3240 
// Produces one row of U and one row of V from a single row of packed source
// data: the even bytes of src_uyvy are split alternately into dst_u and dst_v.
// Unlike UYVYToUVRow_SSE2 no second row is read or averaged.
// Each iteration reads 32 source bytes and writes 8 bytes to each
// destination; width is decremented by 16 per pass. No partial-tail handling.
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm volatile(
      "pcmpeqb   %%xmm5,%%xmm5                   \n"
      "psrlw     $0x8,%%xmm5                     \n"  // 0x00ff in every word
      "sub       %1,%2                           \n"  // dst_v -> offset from dst_u

      LABELALIGN
      "1:                                        \n"
      "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
      "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
      "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_uyvy += 32
      "pand      %%xmm5,%%xmm0                   \n"  // keep even bytes
      "pand      %%xmm5,%%xmm1                   \n"
      "packuswb  %%xmm1,%%xmm0                   \n"  // 16 interleaved bytes
      "movdqa    %%xmm0,%%xmm1                   \n"
      "pand      %%xmm5,%%xmm0                   \n"  // even of those -> dst_u
      "packuswb  %%xmm0,%%xmm0                   \n"
      "psrlw     $0x8,%%xmm1                     \n"  // odd of those -> dst_v
      "packuswb  %%xmm1,%%xmm1                   \n"
      "movq      %%xmm0," MEMACCESS(1) "         \n"
      MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
      "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8
      "sub       $0x10,%3                        \n"
      "jg        1b                              \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm5");
}
3277 #endif  // HAS_YUY2TOYROW_SSE2
3278 
3279 #ifdef HAS_YUY2TOYROW_AVX2
// Copies the even-numbered bytes of src_yuy2 to dst_y (the luma samples, going
// by the function and parameter names).
// Each loop pass reads 64 bytes, writes 32 bytes and subtracts 32 from width;
// the body runs once before the first test and repeats while width > 0.
YUY2ToYRow_AVX2(const uint8 * src_yuy2,uint8 * dst_y,int width)3280 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
3281   asm volatile (
         // ymm5 = 0x00ff in every 16-bit word (mask for the even bytes).
3282     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
3283     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
3284
3285     LABELALIGN
3286     "1:                                        \n"
3287     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3288     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3289     "lea       " MEMLEA(0x40,0) ",%0           \n"
3290     "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
3291     "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
         // vpackuswb packs within each 128-bit lane; vpermq 0xd8 (quadword
         // order 0,2,1,3) puts the 32 result bytes back in source order.
3292     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3293     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3294     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
3295     "lea      " MEMLEA(0x20,1) ",%1            \n"
3296     "sub       $0x20,%2                        \n"
3297     "jg        1b                              \n"
         // Clear the upper ymm halves before returning to non-VEX code.
3298     "vzeroupper                                \n"
3299   : "+r"(src_yuy2),  // %0
3300     "+r"(dst_y),     // %1
3301     "+r"(width)        // %2
3302   :
3303   : "memory", "cc"
3304     , "xmm0", "xmm1", "xmm5"
3305   );
3306 }
3307 
// Extracts U and V rows from two vertically adjacent packed YUY2 rows.
// The row at src_yuy2 and the row at src_yuy2 + stride_yuy2 are combined with
// vpavgb (rounded byte average); the odd-numbered bytes of the result are then
// de-interleaved into dst_u and dst_v.
// Each loop pass reads 64 bytes from each of the two rows, stores 16 bytes to
// dst_u and 16 to dst_v, and subtracts 32 from width; the body runs once
// before the first test and repeats while width > 0.
YUY2ToUVRow_AVX2(const uint8 * src_yuy2,int stride_yuy2,uint8 * dst_u,uint8 * dst_v,int width)3308 void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
3309                       int stride_yuy2,
3310                       uint8* dst_u,
3311                       uint8* dst_v,
3312                       int width) {
3313   asm volatile (
         // ymm5 = 0x00ff in every 16-bit word.
3314     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
3315     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
         // %2 becomes (dst_v - dst_u) so one pointer increment serves both rows.
3316     "sub       %1,%2                           \n"
3317
3318     LABELALIGN
3319     "1:                                        \n"
3320     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3321     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3322     VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3323     VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3324     "lea       " MEMLEA(0x40,0) ",%0           \n"
         // Keep the odd bytes of the averaged rows and pack them into 32 bytes.
3325     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3326     "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
3327     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3328     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
         // Split again: even bytes -> ymm1 (dst_u), odd bytes -> ymm0 (dst_v).
3329     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3330     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3331     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3332     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
         // After the permute the low 128 bits hold all 16 result bytes.
3333     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3334     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3335     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3336     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3337     "lea      " MEMLEA(0x10,1) ",%1            \n"
3338     "sub       $0x20,%3                        \n"
3339     "jg        1b                              \n"
3340     "vzeroupper                                \n"
3341   : "+r"(src_yuy2),    // %0
3342     "+r"(dst_u),       // %1
3343     "+r"(dst_v),       // %2
3344     "+r"(width)          // %3
3345   : "r"((intptr_t)(stride_yuy2))  // %4
3346   : "memory", "cc", NACL_R14
3347     "xmm0", "xmm1", "xmm5"
3348   );
3349 }
3350 
// Extracts U and V rows from a single packed YUY2 row (no vertical averaging).
// The odd-numbered source bytes are taken and de-interleaved into dst_u and
// dst_v.
// Each loop pass reads 64 bytes, stores 16 bytes to dst_u and 16 to dst_v, and
// subtracts 32 from width; the body runs once before the first test and
// repeats while width > 0.
YUY2ToUV422Row_AVX2(const uint8 * src_yuy2,uint8 * dst_u,uint8 * dst_v,int width)3351 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3352                          uint8* dst_u,
3353                          uint8* dst_v,
3354                          int width) {
3355   asm volatile (
         // ymm5 = 0x00ff in every 16-bit word.
3356     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
3357     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
         // %2 becomes (dst_v - dst_u) so one pointer increment serves both rows.
3358     "sub       %1,%2                           \n"
3359
3360     LABELALIGN
3361     "1:                                        \n"
3362     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3363     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3364     "lea       " MEMLEA(0x40,0) ",%0           \n"
         // Keep the odd source bytes and pack them into 32 bytes.
3365     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3366     "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
3367     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3368     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
         // Split again: even bytes -> ymm1 (dst_u), odd bytes -> ymm0 (dst_v).
3369     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3370     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3371     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3372     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
         // After the permute the low 128 bits hold all 16 result bytes.
3373     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3374     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3375     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3376     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3377     "lea      " MEMLEA(0x10,1) ",%1            \n"
3378     "sub       $0x20,%3                        \n"
3379     "jg        1b                              \n"
3380     "vzeroupper                                \n"
3381   : "+r"(src_yuy2),    // %0
3382     "+r"(dst_u),       // %1
3383     "+r"(dst_v),       // %2
3384     "+r"(width)          // %3
3385   :
3386   : "memory", "cc", NACL_R14
3387     "xmm0", "xmm1", "xmm5"
3388   );
3389 }
3390 
// Copies the odd-numbered bytes of src_uyvy to dst_y (the luma samples, going
// by the function and parameter names).
// Each loop pass reads 64 bytes, writes 32 bytes and subtracts 32 from width;
// the body runs once before the first test and repeats while width > 0.
UYVYToYRow_AVX2(const uint8 * src_uyvy,uint8 * dst_y,int width)3391 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
3392   asm volatile (
3393     LABELALIGN
3394     "1:                                        \n"
3395     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3396     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3397     "lea       " MEMLEA(0x40,0) ",%0           \n"
         // Shift each word right by 8 to keep only the odd bytes.
3398     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3399     "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
         // vpackuswb packs within each 128-bit lane; vpermq 0xd8 (quadword
         // order 0,2,1,3) puts the 32 result bytes back in source order.
3400     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3401     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3402     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
3403     "lea      " MEMLEA(0x20,1) ",%1            \n"
3404     "sub       $0x20,%2                        \n"
3405     "jg        1b                              \n"
3406     "vzeroupper                                \n"
3407   : "+r"(src_uyvy),  // %0
3408     "+r"(dst_y),     // %1
3409     "+r"(width)        // %2
3410   :
       // NOTE(review): xmm5 is listed as clobbered although this routine never
       // touches it; harmless, but it could be dropped.
3411   : "memory", "cc"
3412     , "xmm0", "xmm1", "xmm5"
3413   );
3414 }
// Extracts U and V rows from two vertically adjacent packed UYVY rows.
// The row at src_uyvy and the row at src_uyvy + stride_uyvy are combined with
// vpavgb (rounded byte average); the even-numbered bytes of the result are
// then de-interleaved into dst_u and dst_v.
// Each loop pass reads 64 bytes from each of the two rows, stores 16 bytes to
// dst_u and 16 to dst_v, and subtracts 32 from width; the body runs once
// before the first test and repeats while width > 0.
UYVYToUVRow_AVX2(const uint8 * src_uyvy,int stride_uyvy,uint8 * dst_u,uint8 * dst_v,int width)3415 void UYVYToUVRow_AVX2(const uint8* src_uyvy,
3416                       int stride_uyvy,
3417                       uint8* dst_u,
3418                       uint8* dst_v,
3419                       int width) {
3420   asm volatile (
         // ymm5 = 0x00ff in every 16-bit word (mask for the even bytes).
3421     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
3422     "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
         // %2 becomes (dst_v - dst_u) so one pointer increment serves both rows.
3423     "sub       %1,%2                           \n"
3424
3425     LABELALIGN
3426     "1:                                        \n"
3427     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3428     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3429     VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3430     VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3431     "lea       " MEMLEA(0x40,0) ",%0           \n"
         // Keep the even bytes of the averaged rows and pack them into 32 bytes.
3432     "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
3433     "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
3434     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3435     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
         // Split again: even bytes -> ymm1 (dst_u), odd bytes -> ymm0 (dst_v).
3436     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3437     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3438     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3439     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
         // After the permute the low 128 bits hold all 16 result bytes.
3440     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3441     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3442     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3443     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3444     "lea      " MEMLEA(0x10,1) ",%1            \n"
3445     "sub       $0x20,%3                        \n"
3446     "jg        1b                              \n"
3447     "vzeroupper                                \n"
3448   : "+r"(src_uyvy),    // %0
3449     "+r"(dst_u),       // %1
3450     "+r"(dst_v),       // %2
3451     "+r"(width)          // %3
3452   : "r"((intptr_t)(stride_uyvy))  // %4
3453   : "memory", "cc", NACL_R14
3454     "xmm0", "xmm1", "xmm5"
3455   );
3456 }
3457 
// Extracts U and V rows from a single packed UYVY row (no vertical averaging).
// The even-numbered source bytes are taken and de-interleaved into dst_u and
// dst_v.
// Each loop pass reads 64 bytes, stores 16 bytes to dst_u and 16 to dst_v, and
// subtracts 32 from width; the body runs once before the first test and
// repeats while width > 0.
UYVYToUV422Row_AVX2(const uint8 * src_uyvy,uint8 * dst_u,uint8 * dst_v,int width)3458 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3459                          uint8* dst_u,
3460                          uint8* dst_v,
3461                          int width) {
3462   asm volatile (
         // ymm5 = 0x00ff in every 16-bit word (mask for the even bytes).
3463     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3464     "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
         // %2 becomes (dst_v - dst_u) so one pointer increment serves both rows.
3465     "sub       %1,%2                           \n"
3466
3467     LABELALIGN
3468     "1:                                        \n"
3469     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3470     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3471     "lea       " MEMLEA(0x40,0) ",%0           \n"
         // Keep the even source bytes and pack them into 32 bytes.
3472     "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
3473     "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
3474     "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3475     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
         // Split again: even bytes -> ymm1 (dst_u), odd bytes -> ymm0 (dst_v).
3476     "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3477     "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3478     "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3479     "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
         // After the permute the low 128 bits hold all 16 result bytes.
3480     "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3481     "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3482     "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3483     VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3484     "lea      " MEMLEA(0x10,1) ",%1            \n"
3485     "sub       $0x20,%3                        \n"
3486     "jg        1b                              \n"
3487     "vzeroupper                                \n"
3488   : "+r"(src_uyvy),    // %0
3489     "+r"(dst_u),       // %1
3490     "+r"(dst_v),       // %2
3491     "+r"(width)          // %3
3492   :
3493   : "memory", "cc", NACL_R14
3494     "xmm0", "xmm1", "xmm5"
3495   );
3496 }
3497 #endif  // HAS_YUY2TOYROW_AVX2
3498 
3499 #ifdef HAS_ARGBBLENDROW_SSSE3
3500 // Shuffle table for isolating alpha: copies byte 3 of each 4-byte pixel into
     // the low byte of two consecutive 16-bit words; 0x80 entries zero the high
     // byte.
3501 static uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
3502                               11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
3503
3504 // Blend 4 pixels at a time, then any remaining pixels one at a time.
     // Per colour byte: dst = saturate(src_argb0 + ((src_argb1 * (256 - a0)) >> 8))
     // where a0 is the alpha byte (byte 3) of the src_argb0 pixel. The output
     // alpha byte is forced to 255.
     // Unlike the other row functions nearby, a width below 4 skips the vector
     // loop entirely and a width of 0 or less stores nothing.
ARGBBlendRow_SSSE3(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)3505 void ARGBBlendRow_SSSE3(const uint8* src_argb0,
3506                         const uint8* src_argb1,
3507                         uint8* dst_argb,
3508                         int width) {
3509   asm volatile (
         // xmm7 = 0x0001 per word (turns 255 - a into 256 - a).
3510     "pcmpeqb   %%xmm7,%%xmm7                   \n"
3511     "psrlw     $0xf,%%xmm7                     \n"
         // xmm6 = 0x00ff per word (selects the even bytes of a pixel).
3512     "pcmpeqb   %%xmm6,%%xmm6                   \n"
3513     "psrlw     $0x8,%%xmm6                     \n"
         // xmm5 = 0xff00 per word (selects the odd bytes of a pixel).
3514     "pcmpeqb   %%xmm5,%%xmm5                   \n"
3515     "psllw     $0x8,%%xmm5                     \n"
         // xmm4 = 0xff000000 per dword (the alpha byte of each pixel).
3516     "pcmpeqb   %%xmm4,%%xmm4                   \n"
3517     "pslld     $0x18,%%xmm4                    \n"
         // Bias width by -4 so the sign flag tells whether 4 pixels remain.
3518     "sub       $0x4,%3                         \n"
3519     "jl        49f                             \n"
3520
3521     // 4 pixel loop.
3522     LABELALIGN
3523   "40:                                         \n"
3524     "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
3525     "lea       " MEMLEA(0x10,0) ",%0           \n"
3526     "movdqa    %%xmm3,%%xmm0                   \n"
         // Invert the alpha byte (255 - a) and spread it across the pixel's words.
3527     "pxor      %%xmm4,%%xmm3                   \n"
3528     "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
3529     "pshufb    %4,%%xmm3                       \n"
         // Even bytes of src_argb1 times (256 - a).
3530     "pand      %%xmm6,%%xmm2                   \n"
3531     "paddw     %%xmm7,%%xmm3                   \n"
3532     "pmullw    %%xmm3,%%xmm2                   \n"
         // Odd bytes of src_argb1 times (256 - a).
3533     "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
3534     "lea       " MEMLEA(0x10,1) ",%1           \n"
3535     "psrlw     $0x8,%%xmm1                     \n"
         // Force alpha of the src_argb0 copy to 255.
3536     "por       %%xmm4,%%xmm0                   \n"
3537     "pmullw    %%xmm3,%%xmm1                   \n"
         // Take the high byte of each product back to its byte position and
         // accumulate with unsigned saturation.
3538     "psrlw     $0x8,%%xmm2                     \n"
3539     "paddusb   %%xmm2,%%xmm0                   \n"
3540     "pand      %%xmm5,%%xmm1                   \n"
3541     "paddusb   %%xmm1,%%xmm0                   \n"
3542     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
3543     "lea       " MEMLEA(0x10,2) ",%2           \n"
3544     "sub       $0x4,%3                         \n"
3545     "jge       40b                             \n"
3546
3547   "49:                                         \n"
         // Undo the bias, leaving (remaining pixels - 1); negative means done.
3548     "add       $0x3,%3                         \n"
3549     "jl        99f                             \n"
3550
3551     // 1 pixel loop.
3552   "91:                                         \n"
3553     "movd      " MEMACCESS(0) ",%%xmm3         \n"
3554     "lea       " MEMLEA(0x4,0) ",%0            \n"
3555     "movdqa    %%xmm3,%%xmm0                   \n"
3556     "pxor      %%xmm4,%%xmm3                   \n"
3557     "movd      " MEMACCESS(1) ",%%xmm2         \n"
3558     "pshufb    %4,%%xmm3                       \n"
3559     "pand      %%xmm6,%%xmm2                   \n"
3560     "paddw     %%xmm7,%%xmm3                   \n"
3561     "pmullw    %%xmm3,%%xmm2                   \n"
3562     "movd      " MEMACCESS(1) ",%%xmm1         \n"
3563     "lea       " MEMLEA(0x4,1) ",%1            \n"
3564     "psrlw     $0x8,%%xmm1                     \n"
3565     "por       %%xmm4,%%xmm0                   \n"
3566     "pmullw    %%xmm3,%%xmm1                   \n"
3567     "psrlw     $0x8,%%xmm2                     \n"
3568     "paddusb   %%xmm2,%%xmm0                   \n"
3569     "pand      %%xmm5,%%xmm1                   \n"
3570     "paddusb   %%xmm1,%%xmm0                   \n"
3571     "movd      %%xmm0," MEMACCESS(2) "         \n"
3572     "lea       " MEMLEA(0x4,2) ",%2            \n"
3573     "sub       $0x1,%3                         \n"
3574     "jge       91b                             \n"
3575   "99:                                         \n"
3576   : "+r"(src_argb0),    // %0
3577     "+r"(src_argb1),    // %1
3578     "+r"(dst_argb),     // %2
3579     "+r"(width)         // %3
3580   : "m"(kShuffleAlpha)  // %4
3581   : "memory", "cc"
3582     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3583   );
3584 }
3585 #endif  // HAS_ARGBBLENDROW_SSSE3
3586 
3587 #ifdef HAS_BLENDPLANEROW_SSSE3
3588 // Blend 8 pixels at a time.
3589 // unsigned version of math
3590 // =((A2*C2)+(B2*(255-C2))+255)/256
3591 // signed version of math
3592 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
     // In the code below A2 is a src0 byte, B2 a src1 byte and C2 an alpha byte;
     // the signed form is the one implemented, because pmaddubsw multiplies
     // unsigned bytes (the alpha weights) by signed bytes (the biased pixels).
     // Each loop pass reads 8 bytes from each input, writes 8 bytes to dst and
     // subtracts 8 from width; the body runs once before the first test and
     // repeats while width > 0.
BlendPlaneRow_SSSE3(const uint8 * src0,const uint8 * src1,const uint8 * alpha,uint8 * dst,int width)3593 void BlendPlaneRow_SSSE3(const uint8* src0,
3594                          const uint8* src1,
3595                          const uint8* alpha,
3596                          uint8* dst,
3597                          int width) {
3598   asm volatile(
           // xmm5 = 0xff00 per word: XOR turns an (a, a) byte pair into (a, 255 - a).
3599       "pcmpeqb    %%xmm5,%%xmm5                  \n"
3600       "psllw      $0x8,%%xmm5                    \n"
           // xmm6 = 0x80 in every byte: the -128 bias applied to the pixels.
3601       "mov        $0x80808080,%%eax              \n"
3602       "movd       %%eax,%%xmm6                   \n"
3603       "pshufd     $0x0,%%xmm6,%%xmm6             \n"
           // xmm7 = 0x807f per word: the +32768+127 term of the signed formula.
3604       "mov        $0x807f807f,%%eax              \n"
3605       "movd       %%eax,%%xmm7                   \n"
3606       "pshufd     $0x0,%%xmm7,%%xmm7             \n"
           // Turn src0, src1 and dst into offsets from alpha so that advancing
           // the alpha pointer alone steps through all four buffers.
3607       "sub        %2,%0                          \n"
3608       "sub        %2,%1                          \n"
3609       "sub        %2,%3                          \n"
3610
3611       // 8 pixel loop.
3612       LABELALIGN
3613       "1:                                        \n"
           // Build the (a, 255 - a) weight pairs.
3614       "movq       (%2),%%xmm0                    \n"
3615       "punpcklbw  %%xmm0,%%xmm0                  \n"
3616       "pxor       %%xmm5,%%xmm0                  \n"
           // Interleave src0/src1 bytes into (s0, s1) pairs and bias by -128.
3617       "movq       (%0,%2,1),%%xmm1               \n"
3618       "movq       (%1,%2,1),%%xmm2               \n"
3619       "punpcklbw  %%xmm2,%%xmm1                  \n"
3620       "psubb      %%xmm6,%%xmm1                  \n"
           // word = a * (s0 - 128) + (255 - a) * (s1 - 128)
3621       "pmaddubsw  %%xmm1,%%xmm0                  \n"
3622       "paddw      %%xmm7,%%xmm0                  \n"
3623       "psrlw      $0x8,%%xmm0                    \n"
3624       "packuswb   %%xmm0,%%xmm0                  \n"
3625       "movq       %%xmm0,(%3,%2,1)               \n"
3626       "lea        0x8(%2),%2                     \n"
3627       "sub        $0x8,%4                        \n"
3628       "jg        1b                              \n"
3629       : "+r"(src0),   // %0
3630         "+r"(src1),   // %1
3631         "+r"(alpha),  // %2
3632         "+r"(dst),    // %3
3633         "+rm"(width)  // %4
3634         ::"memory",
3635         "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
3636 }
3637 #endif  // HAS_BLENDPLANEROW_SSSE3
3638 
3639 #ifdef HAS_BLENDPLANEROW_AVX2
3640 // Blend 32 pixels at a time.
3641 // unsigned version of math
3642 // =((A2*C2)+(B2*(255-C2))+255)/256
3643 // signed version of math
3644 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
     // In the code below A2 is a src0 byte, B2 a src1 byte and C2 an alpha byte;
     // the signed form is the one implemented, because vpmaddubsw multiplies
     // unsigned bytes (the alpha weights) by signed bytes (the biased pixels).
     // Each loop pass reads 32 bytes from each input, writes 32 bytes to dst and
     // subtracts 32 from width; the body runs once before the first test and
     // repeats while width > 0.
BlendPlaneRow_AVX2(const uint8 * src0,const uint8 * src1,const uint8 * alpha,uint8 * dst,int width)3645 void BlendPlaneRow_AVX2(const uint8* src0,
3646                         const uint8* src1,
3647                         const uint8* alpha,
3648                         uint8* dst,
3649                         int width) {
3650   asm volatile(
           // ymm5 = 0xff00 per word: XOR turns an (a, a) byte pair into (a, 255 - a).
3651       "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3652       "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
           // ymm6 = 0x80 in every byte: the -128 bias applied to the pixels.
3653       "mov        $0x80808080,%%eax              \n"
3654       "vmovd      %%eax,%%xmm6                   \n"
3655       "vbroadcastss %%xmm6,%%ymm6                \n"
           // ymm7 = 0x807f per word: the +32768+127 term of the signed formula.
3656       "mov        $0x807f807f,%%eax              \n"
3657       "vmovd      %%eax,%%xmm7                   \n"
3658       "vbroadcastss %%xmm7,%%ymm7                \n"
           // Turn src0, src1 and dst into offsets from alpha so that advancing
           // the alpha pointer alone steps through all four buffers.
3659       "sub        %2,%0                          \n"
3660       "sub        %2,%1                          \n"
3661       "sub        %2,%3                          \n"
3662
3663       // 32 pixel loop.
3664       LABELALIGN
3665       "1:                                        \n"
           // Build the (a, 255 - a) weight pairs; high halves in ymm3, low in ymm0.
3666       "vmovdqu    (%2),%%ymm0                    \n"
3667       "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
3668       "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
3669       "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
3670       "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
           // Interleave src0/src1 bytes into (s0, s1) pairs and bias by -128.
3671       "vmovdqu    (%0,%2,1),%%ymm1               \n"
3672       "vmovdqu    (%1,%2,1),%%ymm2               \n"
3673       "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
3674       "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
3675       "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
3676       "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
           // word = a * (s0 - 128) + (255 - a) * (s1 - 128)
3677       "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
3678       "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
3679       "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
3680       "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
3681       "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
3682       "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
           // The unpacks and this pack all operate per 128-bit lane, so the
           // bytes return to source order without a cross-lane permute.
3683       "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
3684       "vmovdqu    %%ymm0,(%3,%2,1)               \n"
3685       "lea        0x20(%2),%2                    \n"
3686       "sub        $0x20,%4                       \n"
3687       "jg        1b                              \n"
3688       "vzeroupper                                \n"
3689       : "+r"(src0),   // %0
3690         "+r"(src1),   // %1
3691         "+r"(alpha),  // %2
3692         "+r"(dst),    // %3
3693         "+rm"(width)  // %4
3694         ::"memory",
3695         "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
3696         "xmm7");
3697 }
3698 #endif  // HAS_BLENDPLANEROW_AVX2
3699 
3700 #ifdef HAS_ARGBATTENUATEROW_SSSE3
3701 // Shuffle table duplicating alpha
     // kShuffleAlpha0 serves pixels 0 and 1 of a 16-byte load, kShuffleAlpha1
     // pixels 2 and 3. Each pixel's alpha byte is copied into both bytes of its
     // first three 16-bit words; the 128 entries zero the fourth (alpha) word.
3702 static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
3703                                7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
3704 static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3705                                15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
3706 // Attenuate 4 pixels at a time.
     // Each of the first three bytes of a pixel is multiplied by the pixel's
     // alpha: both are widened to (x * 0x0101), multiplied with pmulhuw (high 16
     // bits of the product) and shifted right by 8. The alpha byte is carried
     // over unchanged.
     // Each loop pass reads and writes 16 bytes and subtracts 4 from width; the
     // body runs once before the first test and repeats while width > 0.
ARGBAttenuateRow_SSSE3(const uint8 * src_argb,uint8 * dst_argb,int width)3707 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3708   asm volatile (
         // xmm3 = 0xff000000 per dword (the alpha byte of each pixel).
3709     "pcmpeqb   %%xmm3,%%xmm3                   \n"
3710     "pslld     $0x18,%%xmm3                    \n"
3711     "movdqa    %3,%%xmm4                       \n"
3712     "movdqa    %4,%%xmm5                       \n"
3713
3714     // 4 pixel loop.
3715     LABELALIGN
3716     "1:                                        \n"
         // Pixels 0-1: xmm0 = alpha words, xmm1 = pixel bytes duplicated to words.
3717     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3718     "pshufb    %%xmm4,%%xmm0                   \n"
3719     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3720     "punpcklbw %%xmm1,%%xmm1                   \n"
3721     "pmulhuw   %%xmm1,%%xmm0                   \n"
         // Pixels 2-3: xmm1 = alpha words, xmm2 = pixel bytes duplicated to words.
3722     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3723     "pshufb    %%xmm5,%%xmm1                   \n"
3724     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3725     "punpckhbw %%xmm2,%%xmm2                   \n"
3726     "pmulhuw   %%xmm2,%%xmm1                   \n"
         // xmm2 = the original alpha bytes, OR-ed back in after packing.
3727     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3728     "lea       " MEMLEA(0x10,0) ",%0           \n"
3729     "pand      %%xmm3,%%xmm2                   \n"
3730     "psrlw     $0x8,%%xmm0                     \n"
3731     "psrlw     $0x8,%%xmm1                     \n"
3732     "packuswb  %%xmm1,%%xmm0                   \n"
3733     "por       %%xmm2,%%xmm0                   \n"
3734     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3735     "lea       " MEMLEA(0x10,1) ",%1           \n"
3736     "sub       $0x4,%2                         \n"
3737     "jg        1b                              \n"
3738   : "+r"(src_argb),    // %0
3739     "+r"(dst_argb),    // %1
3740     "+r"(width)        // %2
3741   : "m"(kShuffleAlpha0),  // %3
3742     "m"(kShuffleAlpha1)  // %4
3743   : "memory", "cc"
3744     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3745   );
3746 }
3747 #endif  // HAS_ARGBATTENUATEROW_SSSE3
3748 
3749 #ifdef HAS_ARGBATTENUATEROW_AVX2
3750 // Shuffle table duplicating alpha.
     // Applied to pixel bytes already widened to 16-bit words (two pixels per
     // 128-bit lane): bytes 6-7 and 14-15 are the alpha words, which are copied
     // over the three preceding words of the same pixel; the 128 entries zero
     // the alpha word itself.
3751 static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
3752                                          128u, 128u, 14u,  15u, 14u, 15u,
3753                                          14u,  15u,  128u, 128u};
3754 // Attenuate 8 pixels at a time.
     // Each of the first three bytes of a pixel is multiplied by the pixel's
     // alpha: both are widened to (x * 0x0101), multiplied with vpmulhuw (high 16
     // bits of the product) and shifted right by 8. The alpha byte is carried
     // over unchanged.
     // Each loop pass reads and writes 32 bytes and subtracts 8 from width; the
     // body runs once before the first test and repeats while width > 0.
ARGBAttenuateRow_AVX2(const uint8 * src_argb,uint8 * dst_argb,int width)3755 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
3756   asm volatile (
         // ymm4 = shuffle table repeated in both 128-bit lanes.
3757     "vbroadcastf128 %3,%%ymm4                  \n"
         // ymm5 = 0xff000000 per dword (the alpha byte of each pixel).
3758     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3759     "vpslld     $0x18,%%ymm5,%%ymm5            \n"
         // %1 becomes (dst_argb - src_argb); stores go to (%0,%1).
3760     "sub        %0,%1                          \n"
3761
3762     // 8 pixel loop.
3763     LABELALIGN
3764     "1:                                        \n"
3765     "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
         // Duplicate every byte into a word (value * 0x0101).
3766     "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
3767     "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
         // ymm2/ymm3 = per-pixel alpha words lined up with the colour words.
3768     "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
3769     "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
3770     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3771     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
         // ymm6 = the original alpha bytes, OR-ed back in after packing.
3772     "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
3773     "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
3774     "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
3775     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
3776     "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
3777     MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
3778     "lea       " MEMLEA(0x20,0) ",%0           \n"
3779     "sub        $0x8,%2                        \n"
3780     "jg        1b                              \n"
3781     "vzeroupper                                \n"
3782   : "+r"(src_argb),    // %0
3783     "+r"(dst_argb),    // %1
3784     "+r"(width)        // %2
3785   : "m"(kShuffleAlpha_AVX2)  // %3
3786   : "memory", "cc"
3787     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3788   );
3789 }
3790 #endif  // HAS_ARGBATTENUATEROW_AVX2
3791 
3792 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
3793 // Unattenuate 4 pixels at a time.
     // For every pixel the alpha byte (byte 3) indexes fixed_invtbl8, a table of
     // 4-byte entries defined outside this section (presumably fixed-point
     // reciprocals of alpha -- confirm at its definition). The entry's low
     // 16-bit word scales the first three bytes of the pixel and its high word
     // scales the alpha byte, using pmulhuw on bytes widened to (x * 0x0101);
     // results are packed back to bytes with unsigned saturation.
     // Each loop pass reads and writes 16 bytes and subtracts 4 from width; the
     // body runs once before the first test and repeats while width > 0.
ARGBUnattenuateRow_SSE2(const uint8 * src_argb,uint8 * dst_argb,int width)3794 void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
3795                              uint8* dst_argb,
3796                              int width) {
       // Scratch register that receives each alpha byte for the table lookup.
3797   uintptr_t alpha;
3798   asm volatile (
3799     // 4 pixel loop.
3800     LABELALIGN
3801     "1:                                        \n"
         // Pixels 0 and 1 (alpha bytes at offsets 3 and 7).
3802     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3803     "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
3804     "punpcklbw %%xmm0,%%xmm0                   \n"
3805     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
3806     "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
3807     MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
         // pshuflw 0x40: words become (lo, lo, lo, hi) of the table entry.
3808     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3809     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
3810     "movlhps   %%xmm3,%%xmm2                   \n"
3811     "pmulhuw   %%xmm2,%%xmm0                   \n"
         // Pixels 2 and 3 (alpha bytes at offsets 0x0b and 0x0f).
3812     "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3813     "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
3814     "punpckhbw %%xmm1,%%xmm1                   \n"
3815     MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
3816     "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
3817     MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
3818     "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3819     "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
3820     "movlhps   %%xmm3,%%xmm2                   \n"
3821     "pmulhuw   %%xmm2,%%xmm1                   \n"
3822     "lea       " MEMLEA(0x10,0) ",%0           \n"
3823     "packuswb  %%xmm1,%%xmm0                   \n"
3824     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3825     "lea       " MEMLEA(0x10,1) ",%1           \n"
3826     "sub       $0x4,%2                         \n"
3827     "jg        1b                              \n"
3828   : "+r"(src_argb),     // %0
3829     "+r"(dst_argb),     // %1
3830     "+r"(width),        // %2
3831     "=&r"(alpha)        // %3
3832   : "r"(fixed_invtbl8)  // %4
       // NOTE(review): xmm4 and xmm5 are listed as clobbered although this
       // routine never touches them; harmless, but they could be dropped.
3833   : "memory", "cc", NACL_R14
3834     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3835   );
3836 }
3837 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
3838 
3839 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
3840 // Shuffle table duplicating alpha.
     // Applied after each gathered table word has been doubled with
     // vpunpck{l,h}wd: per pixel it selects words (0, 0, 0, 3), i.e. the low word
     // of the table entry three times followed by the entry's high word.
3841 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
3842     0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
3843 // Unattenuate 8 pixels at a time.
     // For every pixel the alpha byte (byte 3) indexes fixed_invtbl8, a table of
     // 4-byte entries defined outside this section (presumably fixed-point
     // reciprocals of alpha -- confirm at its definition). The entry's low
     // 16-bit word scales the first three bytes of the pixel and its high word
     // scales the alpha byte, using vpmulhuw on bytes widened to (x * 0x0101);
     // results are packed back to bytes with unsigned saturation.
     // Each loop pass reads and writes 32 bytes and subtracts 8 from width; the
     // body runs once before the first test and repeats while width > 0.
ARGBUnattenuateRow_AVX2(const uint8 * src_argb,uint8 * dst_argb,int width)3844 void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
3845                              uint8* dst_argb,
3846                              int width) {
       // Scratch register that receives each alpha byte for the table lookup.
3847   uintptr_t alpha;
3848   asm volatile (
         // %1 becomes (dst_argb - src_argb); stores go to (%0,%1).
3849     "sub        %0,%1                          \n"
         // ymm5 = shuffle table repeated in both 128-bit lanes.
3850     "vbroadcastf128 %5,%%ymm5                  \n"
3851
3852     // 8 pixel loop.
3853     LABELALIGN
3854     "1:                                        \n"
3855     // replace VPGATHER
         // Eight scalar lookups: the alpha bytes at offsets 3, 7, ..., 0x1f each
         // fetch one 4-byte table entry; the entries are merged pairwise until
         // ymm3 holds all eight in pixel order.
3856     "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
3857     MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
3858     "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
3859     MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
3860     "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
3861     "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
3862     MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
3863     "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
3864     MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
3865     "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
3866     "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
3867     MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
3868     "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
3869     MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
3870     "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
3871     "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
3872     MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
3873     "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
3874     MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
3875     "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
3876     "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
3877     "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
3878     "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
3879     // end of VPGATHER
3880
3881     "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
         // Duplicate every pixel byte into a word (value * 0x0101).
3882     "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
3883     "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
         // Line the table words up with the pixel words: (lo, lo, lo, hi).
3884     "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
3885     "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
3886     "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
3887     "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
3888     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3889     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
3890     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
3891     MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
3892     "lea       " MEMLEA(0x20,0) ",%0           \n"
3893     "sub        $0x8,%2                        \n"
3894     "jg        1b                              \n"
3895     "vzeroupper                                \n"
3896   : "+r"(src_argb),      // %0
3897     "+r"(dst_argb),      // %1
3898     "+r"(width),         // %2
3899     "=&r"(alpha)         // %3
3900   : "r"(fixed_invtbl8),  // %4
3901     "m"(kUnattenShuffleAlpha_AVX2)  // %5
       // NOTE(review): xmm4 is listed as clobbered although this routine never
       // touches it; harmless, but it could be dropped.
3902   : "memory", "cc", NACL_R14
3903     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3904   );
3905 }
3906 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
3907 
3908 #ifdef HAS_ARGBGRAYROW_SSSE3
3909 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
     // The gray value is a weighted sum of the first three bytes of each pixel
     // using the kARGBToYJ coefficients, plus kAddYJ64 (defined outside this
     // section; presumably the rounding term -- confirm), shifted right by 7.
     // It is written to the first three bytes of the output pixel; the fourth
     // (alpha) byte is copied from the source.
     // Each loop pass reads and writes 32 bytes and subtracts 8 from width; the
     // body runs once before the first test and repeats while width > 0.
ARGBGrayRow_SSSE3(const uint8 * src_argb,uint8 * dst_argb,int width)3910 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3911   asm volatile (
3912     "movdqa    %3,%%xmm4                       \n"
3913     "movdqa    %4,%%xmm5                       \n"
3914
3915     // 8 pixel loop.
3916     LABELALIGN
3917     "1:                                        \n"
         // xmm0 = 8 gray bytes: multiply-add, horizontal add, round, >> 7, pack.
3918     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3919     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3920     "pmaddubsw %%xmm4,%%xmm0                   \n"
3921     "pmaddubsw %%xmm4,%%xmm1                   \n"
3922     "phaddw    %%xmm1,%%xmm0                   \n"
3923     "paddw     %%xmm5,%%xmm0                   \n"
3924     "psrlw     $0x7,%%xmm0                     \n"
3925     "packuswb  %%xmm0,%%xmm0                   \n"
         // xmm2 = 8 alpha bytes (top byte of each source dword).
3926     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3927     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
3928     "lea       " MEMLEA(0x20,0) ",%0           \n"
3929     "psrld     $0x18,%%xmm2                    \n"
3930     "psrld     $0x18,%%xmm3                    \n"
3931     "packuswb  %%xmm3,%%xmm2                   \n"
3932     "packuswb  %%xmm2,%%xmm2                   \n"
         // Re-interleave into (gray, gray, gray, alpha) per pixel.
3933     "movdqa    %%xmm0,%%xmm3                   \n"
3934     "punpcklbw %%xmm0,%%xmm0                   \n"
3935     "punpcklbw %%xmm2,%%xmm3                   \n"
3936     "movdqa    %%xmm0,%%xmm1                   \n"
3937     "punpcklwd %%xmm3,%%xmm0                   \n"
3938     "punpckhwd %%xmm3,%%xmm1                   \n"
3939     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3940     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
3941     "lea       " MEMLEA(0x20,1) ",%1           \n"
3942     "sub       $0x8,%2                         \n"
3943     "jg        1b                              \n"
3944   : "+r"(src_argb),   // %0
3945     "+r"(dst_argb),   // %1
3946     "+r"(width)       // %2
3947   : "m"(kARGBToYJ),   // %3
3948     "m"(kAddYJ64)     // %4
3949   : "memory", "cc"
3950     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3951   );
3952 }
3953 #endif  // HAS_ARGBGRAYROW_SSSE3
3954 
3955 #ifdef HAS_ARGBSEPIAROW_SSSE3
// Sepia tone weights. Each output channel is a weighted sum of the source
// channels, shifted right by 7:
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Each table holds the weights for one output channel in memory byte order
// (B, G, R, A) repeated for 4 pixels; the alpha weight is 0.
// Constant for ARGB color to sepia tone
static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                             17, 68, 35, 0, 17, 68, 35, 0};

static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                             22, 88, 45, 0, 22, 88, 45, 0};

static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                             24, 98, 50, 0, 24, 98, 50, 0};
3968 
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Operates in place: dst_argb is both read and written. Each of B, G and R
// is replaced by a weighted sum of the original B, G, R (weights from
// kARGBToSepiaB/G/R) shifted right by 7 and saturated to 0..255; the alpha
// byte is preserved.
// The loop handles 8 pixels per iteration and tests the count after the
// body, so a partial final group is still processed as a full 8 pixels.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    // xmm2/xmm3/xmm4 = weights for the new B/G/R channels.
    "movdqa    %2,%%xmm2                       \n"
    "movdqa    %3,%%xmm3                       \n"
    "movdqa    %4,%%xmm4                       \n"

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // New B for 8 pixels -> low 8 bytes of xmm0.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // New G for 8 pixels -> xmm5, then interleave as (B, G) pairs in xmm0.
    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    // New R for 8 pixels -> xmm5.
    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // Original alpha bytes -> xmm6, then interleave as (R, A) pairs in xmm5.
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    // Merge (B, G) and (R, A) pairs into whole pixels and store in place.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x8,%1                         \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
4025 #endif  // HAS_ARGBSEPIAROW_SSSE3
4026 
4027 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same structure as Sepia, but with a caller-provided signed matrix, a
// shift of 6 instead of 7, and separate source and destination rows.
// matrix_argb points to 16 signed bytes read as 4 rows of 4 coefficients.
// Row k produces output byte k of every pixel: the dot product of the
// pixel's 4 source bytes with that row (saturating 16-bit arithmetic),
// arithmetically shifted right by 6 and saturated to 0..255.
// The loop handles 8 pixels per iteration and tests the count after the
// body, so a partial final group is still processed as a full 8 pixels.
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
                              uint8* dst_argb,
                              const int8* matrix_argb,
                              int width) {
  asm volatile (
    // Broadcast each 4-byte matrix row across a register:
    // xmm2 = row 0, xmm3 = row 1, xmm4 = row 2, xmm5 = row 3.
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
    "pshufd    $0x00,%%xmm5,%%xmm2             \n"
    "pshufd    $0x55,%%xmm5,%%xmm3             \n"
    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
    "pshufd    $0xff,%%xmm5,%%xmm5             \n"

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Output bytes 0 (xmm0) and 1 (xmm6), interleaved into xmm0.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm7                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm7,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm6                   \n"
    "psraw     $0x6,%%xmm0                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm0                   \n"
    // Output bytes 2 (xmm1) and 3 (xmm6), interleaved into xmm1.
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm6                   \n"
    "psraw     $0x6,%%xmm1                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm1                   \n"
    // Merge byte pairs (0,1) and (2,3) into whole pixels and store.
    "movdqa    %%xmm0,%%xmm6                   \n"
    "punpcklwd %%xmm1,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm6                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),      // %0
    "+r"(dst_argb),      // %1
    "+r"(width)          // %2
  : "r"(matrix_argb)     // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
4091 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
4092 
4093 #ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// Operates in place on dst_argb. Each B, G and R byte v becomes
//   ((v * scale) >> 16) * interval_size + interval_offset
// computed in 16-bit lanes from the low 16 bits of each parameter and
// saturated to 0..255 when packed.
// The alpha lane is computed from the upper 16 bits of each parameter and
// the original alpha byte is then ORed in, so alpha is preserved when the
// parameters fit in 16 bits.
// The loop handles 4 pixels per iteration and tests the count after the
// body, so a partial final group is still processed as a full 4 pixels.
void ARGBQuantizeRow_SSE2(uint8* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile (
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    // pshuflw $0x40 yields words (lo, lo, lo, hi) of the 32-bit parameter;
    // pshufd $0x44 copies that pattern to the upper half (2 pixels/register).
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    // xmm5 = 0 for byte->word unpacking; xmm6 = 0xff000000 alpha mask.
    "pxor      %%xmm5,%%xmm5                   \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"

    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Low 2 pixels in xmm0, high 2 pixels in xmm1, as 16-bit lanes.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"
    // xmm7 keeps only the original alpha bytes.
    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"
    "paddw     %%xmm4,%%xmm0                   \n"
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x4,%1                         \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
4144 #endif  // HAS_ARGBQUANTIZEROW_SSE2
4145 
4146 #ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// value holds one scale byte per channel, in the same byte order as a
// pixel. Each source byte p and its scale byte v are widened to 16 bits by
// byte duplication (p * 0x101, v * 0x101); the output byte is the high 16
// bits of their product shifted right by 8, i.e.
//   ((p * 0x101) * (v * 0x101)) >> 24.
// The loop handles 4 pixels per iteration and tests the count after the
// body, so a partial final group is still processed as a full 4 pixels.
void ARGBShadeRow_SSE2(const uint8* src_argb,
                       uint8* dst_argb,
                       int width,
                       uint32 value) {
  asm volatile (
    // xmm2 = the 4 scale bytes duplicated to words, repeated for 2 pixels.
    "movd      %3,%%xmm2                       \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    // Duplicate each pixel byte into a word: low 2 pixels / high 2 pixels.
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
4182 #endif  // HAS_ARGBSHADEROW_SSE2
4183 
4184 #ifdef HAS_ARGBMULTIPLYROW_SSE2
4185 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
ARGBMultiplyRow_SSE2(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)4186 void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
4187                           const uint8* src_argb1,
4188                           uint8* dst_argb,
4189                           int width) {
4190   asm volatile (
4191     "pxor      %%xmm5,%%xmm5                   \n"
4192 
4193     // 4 pixel loop.
4194     LABELALIGN
4195     "1:                                        \n"
4196     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4197     "lea       " MEMLEA(0x10,0) ",%0           \n"
4198     "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
4199     "lea       " MEMLEA(0x10,1) ",%1           \n"
4200     "movdqu    %%xmm0,%%xmm1                   \n"
4201     "movdqu    %%xmm2,%%xmm3                   \n"
4202     "punpcklbw %%xmm0,%%xmm0                   \n"
4203     "punpckhbw %%xmm1,%%xmm1                   \n"
4204     "punpcklbw %%xmm5,%%xmm2                   \n"
4205     "punpckhbw %%xmm5,%%xmm3                   \n"
4206     "pmulhuw   %%xmm2,%%xmm0                   \n"
4207     "pmulhuw   %%xmm3,%%xmm1                   \n"
4208     "packuswb  %%xmm1,%%xmm0                   \n"
4209     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4210     "lea       " MEMLEA(0x10,2) ",%2           \n"
4211     "sub       $0x4,%3                         \n"
4212     "jg        1b                              \n"
4213   : "+r"(src_argb0),  // %0
4214     "+r"(src_argb1),  // %1
4215     "+r"(dst_argb),   // %2
4216     "+r"(width)       // %3
4217   :
4218   : "memory", "cc"
4219     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4220   );
4221 }
4222 #endif  // HAS_ARGBMULTIPLYROW_SSE2
4223 
4224 #ifdef HAS_ARGBMULTIPLYROW_AVX2
4225 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBMultiplyRow_AVX2(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)4226 void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
4227                           const uint8* src_argb1,
4228                           uint8* dst_argb,
4229                           int width) {
4230   asm volatile (
4231     "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
4232 
4233     // 4 pixel loop.
4234     LABELALIGN
4235     "1:                                        \n"
4236     "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
4237     "lea        " MEMLEA(0x20,0) ",%0          \n"
4238     "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
4239     "lea        " MEMLEA(0x20,1) ",%1          \n"
4240     "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
4241     "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
4242     "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
4243     "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
4244     "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
4245     "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
4246     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
4247     "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
4248     "lea       " MEMLEA(0x20,2) ",%2           \n"
4249     "sub        $0x8,%3                        \n"
4250     "jg        1b                              \n"
4251     "vzeroupper                                \n"
4252   : "+r"(src_argb0),  // %0
4253     "+r"(src_argb1),  // %1
4254     "+r"(dst_argb),   // %2
4255     "+r"(width)       // %3
4256   :
4257   : "memory", "cc"
4258 #if defined(__AVX2__)
4259     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4260 #endif
4261   );
4262 }
4263 #endif  // HAS_ARGBMULTIPLYROW_AVX2
4264 
4265 #ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Every byte of dst_argb is the unsigned saturating sum of the matching
// bytes of src_argb0 and src_argb1 (alpha included).
// The loop handles 4 pixels per iteration and tests the count after the
// body, so a partial final group is still processed as a full 4 pixels.
void ARGBAddRow_SSE2(const uint8* src_argb0,
                     const uint8* src_argb1,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
4293 #endif  // HAS_ARGBADDROW_SSE2
4294 
4295 #ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Every byte of dst_argb is the unsigned saturating sum of the matching
// bytes of src_argb0 and src_argb1 (alpha included).
// The loop handles 8 pixels (32 bytes) per iteration and tests the count
// after the body, so a partial final group is still processed as a full 8
// pixels.
void ARGBAddRow_AVX2(const uint8* src_argb0,
                     const uint8* src_argb1,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea        " MEMLEA(0x20,2) ",%2          \n"
    "sub        $0x8,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0"
  );
}
4323 #endif  // HAS_ARGBADDROW_AVX2
4324 
4325 #ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// Every byte of dst_argb is src_argb0 - src_argb1 with unsigned saturation
// (results below 0 become 0), alpha included.
// The loop handles 4 pixels per iteration and tests the count after the
// body, so a partial final group is still processed as a full 4 pixels.
void ARGBSubtractRow_SSE2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psubusb   %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
4353 #endif  // HAS_ARGBSUBTRACTROW_SSE2
4354 
4355 #ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// Every byte of dst_argb is src_argb0 - src_argb1 with unsigned saturation
// (results below 0 become 0), alpha included.
// The loop handles 8 pixels (32 bytes) per iteration and tests the count
// after the body, so a partial final group is still processed as a full 8
// pixels.
void ARGBSubtractRow_AVX2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea        " MEMLEA(0x20,2) ",%2          \n"
    "sub        $0x8,%3                        \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0"
  );
}
4383 #endif  // HAS_ARGBSUBTRACTROW_AVX2
4384 
4385 #ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// For each output pixel x this computes, in 16-bit lanes,
//   |(y0[x] - y0[x+2]) + 2 * (y1[x] - y1[x+2]) + (y2[x] - y2[x+2])|
// and stores it saturated to 0..255. The sign is opposite to the matrix
// above, which does not matter because the absolute value is taken.
// Each iteration produces 8 pixels and reads 8 bytes at offsets 0 and 2 of
// every source row, i.e. 10 bytes per row. The count is tested after the
// body, so a partial final group is still processed as a full 8 pixels.
void SobelXRow_SSE2(const uint8* src_y0,
                    const uint8* src_y1,
                    const uint8* src_y2,
                    uint8* dst_sobelx,
                    int width) {
  asm volatile (
    // Turn %1, %2, %3 into offsets from src_y0 so that advancing %0 alone
    // moves all four row pointers.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "sub       %0,%3                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // xmm0 = y0[x] - y0[x+2]
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"
    // xmm1 = y1[x] - y1[x+2]
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"
    // xmm2 = y2[x] - y2[x+2]
    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"
    // Sum with the middle row counted twice.
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    // Absolute value: max(sum, 0 - sum).
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x8,%4                         \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
4440 #endif  // HAS_SOBELXROW_SSE2
4441 
4442 #ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// For each output pixel x this computes, in 16-bit lanes,
//   |(y0[x] - y1[x]) + 2 * (y0[x+1] - y1[x+1]) + (y0[x+2] - y1[x+2])|
// and stores it saturated to 0..255. The sign is opposite to the matrix
// above, which does not matter because the absolute value is taken.
// Each iteration produces 8 pixels and reads 8 bytes at offsets 0, 1 and 2
// of both source rows, i.e. 10 bytes per row. The count is tested after the
// body, so a partial final group is still processed as a full 8 pixels.
void SobelYRow_SSE2(const uint8* src_y0,
                    const uint8* src_y1,
                    uint8* dst_sobely,
                    int width) {
  asm volatile (
    // Turn %1 and %2 into offsets from src_y0 so that advancing %0 alone
    // moves all three row pointers.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // xmm0 = y0[x] - y1[x]
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"
    // xmm1 = y0[x+1] - y1[x+1]
    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"
    // xmm2 = y0[x+2] - y1[x+2]
    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"
    // Sum with the middle column counted twice.
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    // Absolute value: max(sum, 0 - sum).
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x8,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
4494 #endif  // HAS_SOBELYROW_SSE2
4495 
4496 #ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// Sobel is the unsigned saturating byte sum of src_sobelx and src_sobely.
// Each iteration consumes 16 input bytes from each source and writes 16
// ARGB pixels (64 bytes). The count is tested after the body, so a partial
// final group is still processed as a full 16 pixels.
void SobelRow_SSE2(const uint8* src_sobelx,
                   const uint8* src_sobely,
                   uint8* dst_argb,
                   int width) {
  asm volatile (
    // Turn %1 into an offset from src_sobelx; xmm5 = 0xff000000 alpha mask.
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0x18,%%xmm5                    \n"

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    // Replicate each Sobel byte 4 times (bytes -> words -> dwords), then
    // force the top byte of every dword to 255 for alpha.
    "movdqa    %%xmm0,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm2                   \n"
    "punpckhbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm1                   \n"
    "punpckhwd %%xmm2,%%xmm2                   \n"
    "por       %%xmm5,%%xmm1                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklwd %%xmm0,%%xmm3                   \n"
    "punpckhwd %%xmm0,%%xmm0                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
4547 #endif  // HAS_SOBELROW_SSE2
4548 
4549 #ifdef HAS_SOBELTOPLANEROW_SSE2
4550 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
SobelToPlaneRow_SSE2(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_y,int width)4551 void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
4552                           const uint8* src_sobely,
4553                           uint8* dst_y,
4554                           int width) {
4555   asm volatile (
4556     "sub       %0,%1                           \n"
4557     "pcmpeqb   %%xmm5,%%xmm5                   \n"
4558     "pslld     $0x18,%%xmm5                    \n"
4559 
4560     // 8 pixel loop.
4561     LABELALIGN
4562     "1:                                        \n"
4563     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4564     MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4565     "lea       " MEMLEA(0x10,0) ",%0           \n"
4566     "paddusb   %%xmm1,%%xmm0                   \n"
4567     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4568     "lea       " MEMLEA(0x10,2) ",%2           \n"
4569     "sub       $0x10,%3                        \n"
4570     "jg        1b                              \n"
4571   : "+r"(src_sobelx),  // %0
4572     "+r"(src_sobely),  // %1
4573     "+r"(dst_y),       // %2
4574     "+r"(width)        // %3
4575   :
4576   : "memory", "cc", NACL_R14
4577     "xmm0", "xmm1"
4578   );
4579 }
4580 #endif  // HAS_SOBELTOPLANEROW_SSE2
4581 
4582 #ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// Sobel is the unsigned saturating byte sum of src_sobelx and src_sobely.
// Each iteration consumes 16 input bytes from each source and writes 16
// ARGB pixels (64 bytes). The count is tested after the body, so a partial
// final group is still processed as a full 16 pixels.
void SobelXYRow_SSE2(const uint8* src_sobelx,
                     const uint8* src_sobely,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // Turn %1 into an offset from src_sobelx; xmm5 = all 0xff bytes (alpha).
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // xmm0 = Sobel X, xmm1 = Sobel Y, xmm2 = saturated sum (Sobel).
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "paddusb   %%xmm1,%%xmm2                   \n"
    // (R, A) byte pairs: Sobel X interleaved with 0xff.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "punpckhbw %%xmm5,%%xmm0                   \n"
    // (B, G) byte pairs: Sobel Y interleaved with Sobel.
    "movdqa    %%xmm1,%%xmm4                   \n"
    "punpcklbw %%xmm2,%%xmm4                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    // Merge (B, G) and (R, A) pairs into whole pixels.
    "movdqa    %%xmm4,%%xmm6                   \n"
    "punpcklwd %%xmm3,%%xmm6                   \n"
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "punpcklwd %%xmm0,%%xmm7                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
4632 #endif  // HAS_SOBELXYROW_SSE2
4633 
4634 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
4635 // Creates a table of cumulative sums where each value is a sum of all values
4636 // above and to the left of the value, inclusive of the value.
//   row: source pixels, 4 bytes per pixel; each byte is widened to an int32.
//   cumsum: output, 4 int32 (16 bytes) per pixel.
//   previous_cumsum: same layout as cumsum; its entry for each pixel is added
//                    to the running sum of this row (presumably the table row
//                    directly above - confirm against callers).
//   width: number of pixels.
ComputeCumulativeSumRow_SSE2(const uint8 * row,int32 * cumsum,const int32 * previous_cumsum,int width)4637 void ComputeCumulativeSumRow_SSE2(const uint8* row,
4638                                   int32* cumsum,
4639                                   const int32* previous_cumsum,
4640                                   int width) {
4641   asm volatile (
    // xmm0 = running sum of this row (4 x int32), xmm1 = zero for unpacking.
4642     "pxor      %%xmm0,%%xmm0                   \n"
4643     "pxor      %%xmm1,%%xmm1                   \n"
    // Fewer than 4 pixels, or cumsum not 16-byte aligned: use only the
    // 1 pixel loop.
4644     "sub       $0x4,%3                         \n"
4645     "jl        49f                             \n"
4646     "test      $0xf,%1                         \n"
4647     "jne       49f                             \n"
4648 
4649     // 4 pixel loop.
4650     LABELALIGN
4651     "40:                                       \n"
4652     "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
4653     "lea       " MEMLEA(0x10,0) ",%0           \n"
    // Zero-extend the 16 bytes to int32: xmm2, xmm3, xmm4, xmm5 end up holding
    // pixels 0, 1, 2, 3 respectively.
4654     "movdqa    %%xmm2,%%xmm4                   \n"
4655     "punpcklbw %%xmm1,%%xmm2                   \n"
4656     "movdqa    %%xmm2,%%xmm3                   \n"
4657     "punpcklwd %%xmm1,%%xmm2                   \n"
4658     "punpckhwd %%xmm1,%%xmm3                   \n"
4659     "punpckhbw %%xmm1,%%xmm4                   \n"
4660     "movdqa    %%xmm4,%%xmm5                   \n"
4661     "punpcklwd %%xmm1,%%xmm4                   \n"
4662     "punpckhwd %%xmm1,%%xmm5                   \n"
    // For each pixel in turn: add it to the running sum, then output
    // running sum + previous_cumsum entry. The pixel register is reused to
    // hold the previous_cumsum load once its value has been accumulated.
4663     "paddd     %%xmm2,%%xmm0                   \n"
4664     "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
4665     "paddd     %%xmm0,%%xmm2                   \n"
4666     "paddd     %%xmm3,%%xmm0                   \n"
4667     "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
4668     "paddd     %%xmm0,%%xmm3                   \n"
4669     "paddd     %%xmm4,%%xmm0                   \n"
4670     "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
4671     "paddd     %%xmm0,%%xmm4                   \n"
4672     "paddd     %%xmm5,%%xmm0                   \n"
4673     "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
4674     "lea       " MEMLEA(0x40,2) ",%2           \n"
4675     "paddd     %%xmm0,%%xmm5                   \n"
4676     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
4677     "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
4678     "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
4679     "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
4680     "lea       " MEMLEA(0x40,1) ",%1           \n"
4681     "sub       $0x4,%3                         \n"
4682     "jge       40b                             \n"
4683 
    // width was biased by -4 above; +3 leaves (remaining pixels - 1), which is
    // negative when nothing is left.
4684     "49:                                       \n"
4685     "add       $0x3,%3                         \n"
4686     "jl        19f                             \n"
4687 
4688     // 1 pixel loop.
4689     LABELALIGN
4690     "10:                                       \n"
4691     "movd      " MEMACCESS(0) ",%%xmm2         \n"
4692     "lea       " MEMLEA(0x4,0) ",%0            \n"
4693     "punpcklbw %%xmm1,%%xmm2                   \n"
4694     "punpcklwd %%xmm1,%%xmm2                   \n"
4695     "paddd     %%xmm2,%%xmm0                   \n"
4696     "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
4697     "lea       " MEMLEA(0x10,2) ",%2           \n"
4698     "paddd     %%xmm0,%%xmm2                   \n"
4699     "movdqu    %%xmm2," MEMACCESS(1) "         \n"
4700     "lea       " MEMLEA(0x10,1) ",%1           \n"
4701     "sub       $0x1,%3                         \n"
4702     "jge       10b                             \n"
4703 
4704     "19:                                       \n"
4705   : "+r"(row),  // %0
4706     "+r"(cumsum),  // %1
4707     "+r"(previous_cumsum),  // %2
4708     "+r"(width)  // %3
4709   :
4710   : "memory", "cc"
4711     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4712   );
4713 }
4714 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
4715 
4716 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Converts cumulative sum table entries into averaged output pixels.
// For every output pixel the four int32 channel sums are formed as
//   topleft[0] - topleft[width] - botleft[0] + botleft[width]
// (each term is a group of 4 int32; width is scaled by 4 in the addressing
// below), then scaled by an approximate reciprocal of area and packed to bytes.
//   topleft, botleft: int32 table rows; both advance 16 bytes per pixel.
//   width: offset from the left-hand to the right-hand table entries.
//   area: divisor applied to each sum.
//   dst: output, 4 bytes per pixel.
//   count: number of pixels to produce.
CumulativeSumToAverageRow_SSE2(const int32 * topleft,const int32 * botleft,int width,int area,uint8 * dst,int count)4717 void CumulativeSumToAverageRow_SSE2(const int32* topleft,
4718                                     const int32* botleft,
4719                                     int width,
4720                                     int area,
4721                                     uint8* dst,
4722                                     int count) {
4723   asm volatile (
    // xmm5 = (float)area, xmm4 = approximate 1.0f / area in all four lanes.
4724     "movd      %5,%%xmm5                       \n"
4725     "cvtdq2ps  %%xmm5,%%xmm5                   \n"
4726     "rcpss     %%xmm5,%%xmm4                   \n"
4727     "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    // Fewer than 4 pixels: go straight to the 1 pixel loop.
4728     "sub       $0x4,%3                         \n"
4729     "jl        49f                             \n"
    // area above 0x80 (unsigned compare) uses the float loop at 40.
4730     "cmpl      $0x80,%5                        \n"
4731     "ja        40f                             \n"
4732 
    // Small area: build a 16-bit multiplier in every word of xmm5,
    // (area + 65535) * (1 / area) converted to int and packed with signed
    // saturation. xmm6 = 0x0000ffff per lane, converted to 65535.0f.
4733     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4734     "pcmpeqb   %%xmm6,%%xmm6                   \n"
4735     "psrld     $0x10,%%xmm6                    \n"
4736     "cvtdq2ps  %%xmm6,%%xmm6                   \n"
4737     "addps     %%xmm6,%%xmm5                   \n"
4738     "mulps     %%xmm4,%%xmm5                   \n"
4739     "cvtps2dq  %%xmm5,%%xmm5                   \n"
4740     "packssdw  %%xmm5,%%xmm5                   \n"
4741 
4742     // 4 pixel small loop.
4743     LABELALIGN
4744   "4:                                         \n"
4745     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4746     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4747     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
4748     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
4749     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4750     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
4751     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
4752     MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
4753     "lea       " MEMLEA(0x40,0) ",%0           \n"
4754     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4755     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
4756     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
4757     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
4758     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4759     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
4760     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
4761     MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
4762     "lea       " MEMLEA(0x40,1) ",%1           \n"
    // Narrow the sums to 16 bits, keep the high 16 bits of sum * multiplier,
    // then narrow to bytes with unsigned saturation.
4763     "packssdw  %%xmm1,%%xmm0                   \n"
4764     "packssdw  %%xmm3,%%xmm2                   \n"
4765     "pmulhuw   %%xmm5,%%xmm0                   \n"
4766     "pmulhuw   %%xmm5,%%xmm2                   \n"
4767     "packuswb  %%xmm2,%%xmm0                   \n"
4768     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4769     "lea       " MEMLEA(0x10,2) ",%2           \n"
4770     "sub       $0x4,%3                         \n"
4771     "jge       4b                              \n"
4772     "jmp       49f                             \n"
4773 
4774   // 4 pixel loop                              \n"
    // Same box sum as the small loop, but scaled in float by xmm4.
4775     LABELALIGN
4776   "40:                                         \n"
4777     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4778     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4779     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
4780     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
4781     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4782     MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
4783     MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
4784     MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
4785     "lea       " MEMLEA(0x40,0) ",%0           \n"
4786     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4787     "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
4788     "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
4789     "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
4790     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4791     MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
4792     MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
4793     MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
4794     "lea       " MEMLEA(0x40,1) ",%1           \n"
4795     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
4796     "cvtdq2ps  %%xmm1,%%xmm1                   \n"
4797     "mulps     %%xmm4,%%xmm0                   \n"
4798     "mulps     %%xmm4,%%xmm1                   \n"
4799     "cvtdq2ps  %%xmm2,%%xmm2                   \n"
4800     "cvtdq2ps  %%xmm3,%%xmm3                   \n"
4801     "mulps     %%xmm4,%%xmm2                   \n"
4802     "mulps     %%xmm4,%%xmm3                   \n"
4803     "cvtps2dq  %%xmm0,%%xmm0                   \n"
4804     "cvtps2dq  %%xmm1,%%xmm1                   \n"
4805     "cvtps2dq  %%xmm2,%%xmm2                   \n"
4806     "cvtps2dq  %%xmm3,%%xmm3                   \n"
4807     "packssdw  %%xmm1,%%xmm0                   \n"
4808     "packssdw  %%xmm3,%%xmm2                   \n"
4809     "packuswb  %%xmm2,%%xmm0                   \n"
4810     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4811     "lea       " MEMLEA(0x10,2) ",%2           \n"
4812     "sub       $0x4,%3                         \n"
4813     "jge       40b                             \n"
4814 
    // count was biased by -4 above; +3 leaves (remaining pixels - 1).
4815   "49:                                         \n"
4816     "add       $0x3,%3                         \n"
4817     "jl        19f                             \n"
4818 
4819   // 1 pixel loop                              \n"
    // Always uses the float reciprocal in xmm4, whatever the area.
4820     LABELALIGN
4821   "10:                                         \n"
4822     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4823     MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4824     "lea       " MEMLEA(0x10,0) ",%0           \n"
4825     "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4826     MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4827     "lea       " MEMLEA(0x10,1) ",%1           \n"
4828     "cvtdq2ps  %%xmm0,%%xmm0                   \n"
4829     "mulps     %%xmm4,%%xmm0                   \n"
4830     "cvtps2dq  %%xmm0,%%xmm0                   \n"
4831     "packssdw  %%xmm0,%%xmm0                   \n"
4832     "packuswb  %%xmm0,%%xmm0                   \n"
4833     "movd      %%xmm0," MEMACCESS(2) "         \n"
4834     "lea       " MEMLEA(0x4,2) ",%2            \n"
4835     "sub       $0x1,%3                         \n"
4836     "jge       10b                             \n"
4837   "19:                                         \n"
4838   : "+r"(topleft),  // %0
4839     "+r"(botleft),  // %1
4840     "+r"(dst),      // %2
4841     "+rm"(count)    // %3
4842   : "r"((intptr_t)(width)),  // %4
4843     "rm"(area)     // %5
4844   : "memory", "cc", NACL_R14
4845     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4846   );
4847 }
4848 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4849 
4850 #ifdef HAS_ARGBAFFINEROW_SSE2
4851 // Copy ARGB pixels from source image with slope to a row of destination.
//   src_argb: base address that the computed byte offsets are added to.
//   src_argb_stride: multiplier applied to the y coordinate; it is packed
//                    into the high 16 bits of a pmaddwd operand below.
//   dst_argb: output row, 4 bytes written per pixel.
//   src_dudv: 4 floats; [0],[1] are the starting (x, y) and [2],[3] the
//             step added per destination pixel.
//   width: number of destination pixels.
// Coordinates are truncated to integers and packed to signed 16-bit values
// before the source offset x * 4 + y * stride is computed.
4852 LIBYUV_API
ARGBAffineRow_SSE2(const uint8 * src_argb,int src_argb_stride,uint8 * dst_argb,const float * src_dudv,int width)4853 void ARGBAffineRow_SSE2(const uint8* src_argb,
4854                         int src_argb_stride,
4855                         uint8* dst_argb,
4856                         const float* src_dudv,
4857                         int width) {
  // Register-width copy of the stride; its register (%1) is reused as a
  // scratch offset inside the loops once the multiplier has been captured.
4858   intptr_t src_argb_stride_temp = src_argb_stride;
  // Second scratch offset register (%5).
4859   intptr_t temp;
4860   asm volatile (
    // xmm2 = starting (x, y), xmm7 = per-pixel step (dx, dy).
4861     "movq      " MEMACCESS(3) ",%%xmm2         \n"
4862     "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"
    // xmm5 lane 0 = (stride << 16) + 4: the word pair (4, stride) for pmaddwd.
4863     "shl       $0x10,%1                        \n"
4864     "add       $0x4,%1                         \n"
4865     "movd      %1,%%xmm5                       \n"
    // Fewer than 4 pixels: skip the vector setup and use the 1 pixel loop.
4866     "sub       $0x4,%4                         \n"
4867     "jl        49f                             \n"
4868 
    // Set up coordinates for 4 pixels:
    //   xmm7 = step repeated twice, xmm5 = multiplier in all lanes,
    //   xmm2 = coordinates of pixels 0 and 1, xmm3 = pixels 2 and 3,
    //   xmm4 = 4 * step (the advance per loop pass).
4869     "pshufd    $0x44,%%xmm7,%%xmm7             \n"
4870     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4871     "movdqa    %%xmm2,%%xmm0                   \n"
4872     "addps     %%xmm7,%%xmm0                   \n"
4873     "movlhps   %%xmm0,%%xmm2                   \n"
4874     "movdqa    %%xmm7,%%xmm4                   \n"
4875     "addps     %%xmm4,%%xmm4                   \n"
4876     "movdqa    %%xmm2,%%xmm3                   \n"
4877     "addps     %%xmm4,%%xmm3                   \n"
4878     "addps     %%xmm4,%%xmm4                   \n"
4879 
4880   // 4 pixel loop                              \n"
4881     LABELALIGN
4882   "40:                                         \n"
4883     "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
4884     "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
4885     "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
4886     "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
    // Pull the four 32-bit offsets out one lane at a time (pshufd $0x39
    // rotates the next lane into position) and gather one pixel per offset.
4887     "movd      %%xmm0,%k1                      \n"
4888     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4889     "movd      %%xmm0,%k5                      \n"
4890     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4891     MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
4892     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
4893     "punpckldq %%xmm6,%%xmm1                   \n"
4894     "addps     %%xmm4,%%xmm2                   \n"
4895     "movq      %%xmm1," MEMACCESS(2) "         \n"
4896     "movd      %%xmm0,%k1                      \n"
4897     "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4898     "movd      %%xmm0,%k5                      \n"
4899     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
4900     MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
4901     "punpckldq %%xmm6,%%xmm0                   \n"
4902     "addps     %%xmm4,%%xmm3                   \n"
4903     "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
4904     "lea       " MEMLEA(0x10,2) ",%2           \n"
4905     "sub       $0x4,%4                         \n"
4906     "jge       40b                             \n"
4907 
    // width was biased by -4 above; +3 leaves (remaining pixels - 1).
4908   "49:                                         \n"
4909     "add       $0x3,%4                         \n"
4910     "jl        19f                             \n"
4911 
4912   // 1 pixel loop                              \n"
    // Uses only the low (x, y) of xmm2 and lane 0 of xmm5, so it also works
    // when the 4 pixel setup above was skipped.
4913     LABELALIGN
4914   "10:                                         \n"
4915     "cvttps2dq %%xmm2,%%xmm0                   \n"
4916     "packssdw  %%xmm0,%%xmm0                   \n"
4917     "pmaddwd   %%xmm5,%%xmm0                   \n"
4918     "addps     %%xmm7,%%xmm2                   \n"
4919     "movd      %%xmm0,%k1                      \n"
4920     MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
4921     "movd      %%xmm0," MEMACCESS(2) "         \n"
4922     "lea       " MEMLEA(0x04,2) ",%2           \n"
4923     "sub       $0x1,%4                         \n"
4924     "jge       10b                             \n"
4925   "19:                                         \n"
4926   : "+r"(src_argb),  // %0
4927     "+r"(src_argb_stride_temp),  // %1
4928     "+r"(dst_argb),  // %2
4929     "+r"(src_dudv),  // %3
4930     "+rm"(width),    // %4
4931     "=&r"(temp)      // %5
4932   :
4933   : "memory", "cc", NACL_R14
4934     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4935   );
4936 }
4937 #endif  // HAS_ARGBAFFINEROW_SSE2
4938 
4939 #ifdef HAS_INTERPOLATEROW_SSSE3
4940 // Bilinear filter 16x2 -> 16x1
// Blends the row at src_ptr with the row at src_ptr + src_stride:
//   dst = (row0 * (256 - f) + row1 * f + 128) >> 8, f = source_y_fraction.
// f == 0 copies row0 and f == 0x80 uses a byte average instead.
//   dst_ptr: output row.
//   src_ptr, src_stride: first source row and byte distance to the second.
//   dst_width: bytes to produce; each loop handles 16 per pass and always
//              runs at least once.
//   source_y_fraction: weight of the second row; the general path packs both
//                      weights into bytes, so presumably 0..255 - TODO confirm.
InterpolateRow_SSSE3(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)4941 void InterpolateRow_SSSE3(uint8* dst_ptr,
4942                           const uint8* src_ptr,
4943                           ptrdiff_t src_stride,
4944                           int dst_width,
4945                           int source_y_fraction) {
4946   asm volatile (
    // Turn dst_ptr into an offset from src_ptr so only %1 has to advance;
    // the MEMOPMEM stores below presumably address (%1,%0,1) - confirm against
    // the macro definition.
4947     "sub       %1,%0                           \n"
4948     "cmp       $0x0,%3                         \n"
4949     "je        100f                            \n"
4950     "cmp       $0x80,%3                        \n"
4951     "je        50f                             \n"
4952 
    // xmm5 = byte pairs (256 - f, f) in every word; xmm4 = 0x80 in every byte.
4953     "movd      %3,%%xmm0                       \n"
4954     "neg       %3                              \n"
4955     "add       $0x100,%3                       \n"
4956     "movd      %3,%%xmm5                       \n"
4957     "punpcklbw %%xmm0,%%xmm5                   \n"
4958     "punpcklwd %%xmm5,%%xmm5                   \n"
4959     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4960     "mov       $0x80808080,%%eax               \n"
4961     "movd      %%eax,%%xmm4                    \n"
4962     "pshufd    $0x0,%%xmm4,%%xmm4              \n"
4963 
4964     // General purpose row blend.
4965     LABELALIGN
4966     "1:                                        \n"
4967     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4968     MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    // Interleave row0 and row1 bytes, then subtract 0x80 so the pixels can be
    // the signed operand of pmaddubsw (the weights are the unsigned operand).
4969     "movdqa     %%xmm0,%%xmm1                  \n"
4970     "punpcklbw  %%xmm2,%%xmm0                  \n"
4971     "punpckhbw  %%xmm2,%%xmm1                  \n"
4972     "psubb      %%xmm4,%%xmm0                  \n"
4973     "psubb      %%xmm4,%%xmm1                  \n"
4974     "movdqa     %%xmm5,%%xmm2                  \n"
4975     "movdqa     %%xmm5,%%xmm3                  \n"
4976     "pmaddubsw  %%xmm0,%%xmm2                  \n"
4977     "pmaddubsw  %%xmm1,%%xmm3                  \n"
    // Adding 0x8080 per word undoes the 0x80 bias (0x80 * 256 = 0x8000) and
    // adds 0x80 for rounding before the shift by 8.
4978     "paddw      %%xmm4,%%xmm2                  \n"
4979     "paddw      %%xmm4,%%xmm3                  \n"
4980     "psrlw      $0x8,%%xmm2                    \n"
4981     "psrlw      $0x8,%%xmm3                    \n"
4982     "packuswb   %%xmm3,%%xmm2                  \n"
4983     MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
4984     "lea       " MEMLEA(0x10,1) ",%1           \n"
4985     "sub       $0x10,%2                        \n"
4986     "jg        1b                              \n"
4987     "jmp       99f                             \n"
4988 
4989     // Blend 50 / 50.
4990     LABELALIGN
4991   "50:                                         \n"
4992     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
4993     MEMOPREG(movdqu,0x00,1,4,1,xmm1)
4994     "pavgb     %%xmm1,%%xmm0                   \n"
4995     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
4996     "lea       " MEMLEA(0x10,1) ",%1           \n"
4997     "sub       $0x10,%2                        \n"
4998     "jg        50b                             \n"
4999     "jmp       99f                             \n"
5000 
5001     // Blend 100 / 0 - Copy row unchanged.
5002     LABELALIGN
5003   "100:                                        \n"
5004     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
5005     MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
5006     "lea       " MEMLEA(0x10,1) ",%1           \n"
5007     "sub       $0x10,%2                        \n"
5008     "jg        100b                            \n"
5009 
5010   "99:                                         \n"
5011   : "+r"(dst_ptr),     // %0
5012     "+r"(src_ptr),     // %1
5013     "+rm"(dst_width),  // %2
5014     "+r"(source_y_fraction)  // %3
5015   : "r"((intptr_t)(src_stride))  // %4
5016   : "memory", "cc", "eax", NACL_R14
5017     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
5018   );
5019 }
5020 #endif  // HAS_INTERPOLATEROW_SSSE3
5021 
5022 #ifdef HAS_INTERPOLATEROW_AVX2
5023 // Bilinear filter 32x2 -> 32x1
// Blends the row at src_ptr with the row at src_ptr + src_stride:
//   dst = (row0 * (256 - f) + row1 * f + 128) >> 8, f = source_y_fraction.
// f == 0 copies row0 with rep movsb and f == 0x80 uses a byte average.
//   dst_ptr: output row.
//   src_ptr, src_stride: first source row and byte distance to the second.
//   dst_width: bytes to produce; the blend loops handle 32 per pass and
//              always run at least once.
//   source_y_fraction: weight of the second row; the general path packs both
//                      weights into bytes, so presumably 0..255 - TODO confirm.
InterpolateRow_AVX2(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)5024 void InterpolateRow_AVX2(uint8* dst_ptr,
5025                          const uint8* src_ptr,
5026                          ptrdiff_t src_stride,
5027                          int dst_width,
5028                          int source_y_fraction) {
5029   asm volatile (
    // The f == 0 test comes before the sub so that %0 still holds the real
    // destination address when the copy path at 100 is taken.
5030     "cmp       $0x0,%3                         \n"
5031     "je        100f                            \n"
    // Turn dst_ptr into an offset from src_ptr so only %1 has to advance.
5032     "sub       %1,%0                           \n"
5033     "cmp       $0x80,%3                        \n"
5034     "je        50f                             \n"
5035 
    // ymm5 = byte pairs (256 - f, f) in every word; ymm4 = 0x80 in every byte.
5036     "vmovd      %3,%%xmm0                      \n"
5037     "neg        %3                             \n"
5038     "add        $0x100,%3                      \n"
5039     "vmovd      %3,%%xmm5                      \n"
5040     "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
5041     "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
5042     "vbroadcastss %%xmm5,%%ymm5                \n"
5043     "mov        $0x80808080,%%eax              \n"
5044     "vmovd      %%eax,%%xmm4                   \n"
5045     "vbroadcastss %%xmm4,%%ymm4                \n"
5046 
5047     // General purpose row blend.
5048     LABELALIGN
5049     "1:                                        \n"
5050     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
5051     MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
    // Interleave row0 and row1 bytes, subtract 0x80 so the pixels can be the
    // signed operand of vpmaddubsw, multiply-add with the weights, then add
    // 0x8080 per word (undo the bias, plus 0x80 rounding) and shift by 8.
5052     "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
5053     "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
5054     "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"
5055     "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
5056     "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"
5057     "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
5058     "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
5059     "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
5060     "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
5061     "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
5062     "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
5063     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
5064     "lea       " MEMLEA(0x20,1) ",%1           \n"
5065     "sub       $0x20,%2                        \n"
5066     "jg        1b                              \n"
5067     "jmp       99f                             \n"
5068 
5069     // Blend 50 / 50.
5070     LABELALIGN
5071   "50:                                         \n"
5072     "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
5073     VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
5074     MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
5075     "lea       " MEMLEA(0x20,1) ",%1           \n"
5076     "sub       $0x20,%2                        \n"
5077     "jg        50b                             \n"
5078     "jmp       99f                             \n"
5079 
5080     // Blend 100 / 0 - Copy row unchanged.
    // Relies on the "+D" / "+S" / "+cm" constraints below to supply the
    // destination, source and count for the string copy.
    // NOTE(review): "+cm" also permits dst_width to be a memory operand, while
    // rep movsb takes its count from the count register - confirm that the
    // memory alternative cannot be selected here.
5081     LABELALIGN
5082   "100:                                        \n"
5083     "rep movsb " MEMMOVESTRING(1,0) "          \n"
    // Skips vzeroupper: this path does not touch any ymm register.
5084     "jmp       999f                            \n"
5085 
5086   "99:                                         \n"
5087     "vzeroupper                                \n"
5088   "999:                                        \n"
5089   : "+D"(dst_ptr),    // %0
5090     "+S"(src_ptr),    // %1
5091     "+cm"(dst_width),  // %2
5092     "+r"(source_y_fraction)  // %3
5093   : "r"((intptr_t)(src_stride))  // %4
5094   : "memory", "cc", "eax", NACL_R14
5095     "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
5096   );
5097 }
5098 #endif  // HAS_INTERPOLATEROW_AVX2
5099 
5100 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
5101 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders bytes with pshufb using a caller supplied 16-byte control mask.
//   src_argb: input, 4 bytes per pixel.
//   dst_argb: output, 4 bytes per pixel.
//   shuffler: 16 bytes loaded once as the pshufb control.
//   width: pixel count; each pass handles 8 pixels (32 bytes) and the loop
//          body always runs at least once.
ARGBShuffleRow_SSSE3(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int width)5102 void ARGBShuffleRow_SSSE3(const uint8* src_argb,
5103                           uint8* dst_argb,
5104                           const uint8* shuffler,
5105                           int width) {
5106   asm volatile (
    // xmm5 = shuffle control, constant for the whole row.
5107     "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
5108     LABELALIGN
5109     "1:                                        \n"
5110     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5111     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
5112     "lea       " MEMLEA(0x20,0) ",%0           \n"
5113     "pshufb    %%xmm5,%%xmm0                   \n"
5114     "pshufb    %%xmm5,%%xmm1                   \n"
5115     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5116     "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
5117     "lea       " MEMLEA(0x20,1) ",%1           \n"
5118     "sub       $0x8,%2                         \n"
5119     "jg        1b                              \n"
5120   : "+r"(src_argb),  // %0
5121     "+r"(dst_argb),  // %1
5122     "+r"(width)        // %2
5123   : "r"(shuffler)    // %3
5124   : "memory", "cc"
5125     , "xmm0", "xmm1", "xmm5"
5126   );
5127 }
5128 #endif  // HAS_ARGBSHUFFLEROW_SSSE3
5129 
5130 #ifdef HAS_ARGBSHUFFLEROW_AVX2
5131 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders bytes with vpshufb using a caller supplied 16-byte control mask
// that is replicated into both 128-bit halves of ymm5.
//   src_argb: input, 4 bytes per pixel.
//   dst_argb: output, 4 bytes per pixel.
//   shuffler: 16 bytes loaded once as the shuffle control.
//   width: pixel count; each pass handles 16 pixels (64 bytes) and the loop
//          body always runs at least once.
ARGBShuffleRow_AVX2(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int width)5132 void ARGBShuffleRow_AVX2(const uint8* src_argb,
5133                          uint8* dst_argb,
5134                          const uint8* shuffler,
5135                          int width) {
5136   asm volatile (
    // ymm5 = the 16-byte control in both halves, constant for the whole row.
5137     "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
5138     LABELALIGN
5139     "1:                                        \n"
5140     "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
5141     "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
5142     "lea       " MEMLEA(0x40,0) ",%0           \n"
5143     "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
5144     "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
5145     "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
5146     "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
5147     "lea       " MEMLEA(0x40,1) ",%1           \n"
5148     "sub       $0x10,%2                        \n"
5149     "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
5150     "vzeroupper                                \n"
5151   : "+r"(src_argb),  // %0
5152     "+r"(dst_argb),  // %1
5153     "+r"(width)        // %2
5154   : "r"(shuffler)    // %3
5155   : "memory", "cc"
5156     , "xmm0", "xmm1", "xmm5"
5157   );
5158 }
5159 #endif  // HAS_ARGBSHUFFLEROW_AVX2
5160 
5161 #ifdef HAS_ARGBSHUFFLEROW_SSE2
5162 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// SSE2 version: only the first 4 bytes of shuffler are read. Four known
// orders get a 4-pixel-per-pass path built from pshufhw / pshuflw; any other
// order falls back to a byte-at-a-time loop handling one pixel per pass.
//   src_argb: input, 4 bytes per pixel.
//   dst_argb: output, 4 bytes per pixel.
//   shuffler: shuffler[i] is the source byte index for output byte i.
//   width: pixel count; every loop body runs at least once.
ARGBShuffleRow_SSE2(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int width)5163 void ARGBShuffleRow_SSE2(const uint8* src_argb,
5164                          uint8* dst_argb,
5165                          const uint8* shuffler,
5166                          int width) {
  // Scratch register for the asm; the "=&d" constraint pins it to the d
  // register, whose low byte is accessed as %b2.
5167   uintptr_t pixel_temp;
5168   asm volatile (
    // xmm5 = zero, used to widen bytes to words in the fast paths.
5169     "pxor      %%xmm5,%%xmm5                   \n"
    // Read shuffler[0..3] as one little-endian 32-bit value and dispatch.
    // The label names are the hex digits of the constants compared against.
5170     "mov       " MEMACCESS(4) ",%k2            \n"
5171     "cmp       $0x3000102,%k2                  \n"
5172     "je        3012f                           \n"
5173     "cmp       $0x10203,%k2                    \n"
5174     "je        123f                            \n"
5175     "cmp       $0x30201,%k2                    \n"
5176     "je        321f                            \n"
5177     "cmp       $0x2010003,%k2                  \n"
5178     "je        2103f                           \n"
5179 
    // Generic path, 1 pixel per pass: for each of the 4 output bytes load the
    // index from shuffler, load the source byte at that index, store it.
5180     LABELALIGN
5181     "1:                                        \n"
5182     "movzb     " MEMACCESS(4) ",%2             \n"
5183     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5184     "mov       %b2," MEMACCESS(1) "            \n"
5185     "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
5186     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5187     "mov       %b2," MEMACCESS2(0x1,1) "       \n"
5188     "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
5189     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5190     "mov       %b2," MEMACCESS2(0x2,1) "       \n"
5191     "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
5192     MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5193     "mov       %b2," MEMACCESS2(0x3,1) "       \n"
5194     "lea       " MEMLEA(0x4,0) ",%0            \n"
5195     "lea       " MEMLEA(0x4,1) ",%1            \n"
5196     "sub       $0x1,%3                         \n"
5197     "jg        1b                              \n"
5198     "jmp       99f                             \n"
5199 
    // shuffler = {3, 2, 1, 0}: reverse the 4 bytes of each pixel.
    // All fast paths widen bytes to words, permute the words within each
    // pixel, then narrow back to bytes; 4 pixels per pass.
5200     LABELALIGN
5201   "123:                                        \n"
5202     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5203     "lea       " MEMLEA(0x10,0) ",%0           \n"
5204     "movdqa    %%xmm0,%%xmm1                   \n"
5205     "punpcklbw %%xmm5,%%xmm0                   \n"
5206     "punpckhbw %%xmm5,%%xmm1                   \n"
5207     "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
5208     "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
5209     "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
5210     "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
5211     "packuswb  %%xmm1,%%xmm0                   \n"
5212     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5213     "lea       " MEMLEA(0x10,1) ",%1           \n"
5214     "sub       $0x4,%3                         \n"
5215     "jg        123b                            \n"
5216     "jmp       99f                             \n"
5217 
    // shuffler = {1, 2, 3, 0}.
5218     LABELALIGN
5219   "321:                                        \n"
5220     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5221     "lea       " MEMLEA(0x10,0) ",%0           \n"
5222     "movdqa    %%xmm0,%%xmm1                   \n"
5223     "punpcklbw %%xmm5,%%xmm0                   \n"
5224     "punpckhbw %%xmm5,%%xmm1                   \n"
5225     "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
5226     "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
5227     "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
5228     "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
5229     "packuswb  %%xmm1,%%xmm0                   \n"
5230     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5231     "lea       " MEMLEA(0x10,1) ",%1           \n"
5232     "sub       $0x4,%3                         \n"
5233     "jg        321b                            \n"
5234     "jmp       99f                             \n"
5235 
    // shuffler = {3, 0, 1, 2}.
5236     LABELALIGN
5237   "2103:                                       \n"
5238     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5239     "lea       " MEMLEA(0x10,0) ",%0           \n"
5240     "movdqa    %%xmm0,%%xmm1                   \n"
5241     "punpcklbw %%xmm5,%%xmm0                   \n"
5242     "punpckhbw %%xmm5,%%xmm1                   \n"
5243     "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
5244     "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
5245     "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
5246     "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
5247     "packuswb  %%xmm1,%%xmm0                   \n"
5248     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5249     "lea       " MEMLEA(0x10,1) ",%1           \n"
5250     "sub       $0x4,%3                         \n"
5251     "jg        2103b                           \n"
5252     "jmp       99f                             \n"
5253 
    // shuffler = {2, 1, 0, 3}: swap bytes 0 and 2 of each pixel.
5254     LABELALIGN
5255   "3012:                                       \n"
5256     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5257     "lea       " MEMLEA(0x10,0) ",%0           \n"
5258     "movdqa    %%xmm0,%%xmm1                   \n"
5259     "punpcklbw %%xmm5,%%xmm0                   \n"
5260     "punpckhbw %%xmm5,%%xmm1                   \n"
5261     "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
5262     "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
5263     "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
5264     "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
5265     "packuswb  %%xmm1,%%xmm0                   \n"
5266     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5267     "lea       " MEMLEA(0x10,1) ",%1           \n"
5268     "sub       $0x4,%3                         \n"
5269     "jg        3012b                           \n"
5270 
5271   "99:                                         \n"
5272   : "+r"(src_argb),     // %0
5273     "+r"(dst_argb),     // %1
5274     "=&d"(pixel_temp),  // %2
5275     "+r"(width)         // %3
5276   : "r"(shuffler)       // %4
5277   : "memory", "cc", NACL_R14
5278     "xmm0", "xmm1", "xmm5"
5279   );
5280 }
5281 #endif  // HAS_ARGBSHUFFLEROW_SSE2
5282 
#ifdef HAS_I422TOYUY2ROW_SSE2
// Packs one row of planar I422 (separate Y, U and V rows) into interleaved
// YUY2. The stores below emit bytes in the order Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
//
//   src_y     - luma row; 16 bytes are read per loop iteration.
//   src_u     - U row; 8 bytes are read per loop iteration.
//   src_v     - V row; 8 bytes are read per loop iteration.
//   dst_frame - output row; 32 bytes are written per loop iteration.
//   width     - pixel count. The counter drops by 16 per iteration and is
//               only tested after the stores, so the body runs at least once
//               and always processes a whole group of 16 pixels.
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame,
                        int width) {
  asm volatile (
    // Turn src_v into an offset from src_u so the single advancing pointer
    // %1 can address both chroma rows.
    "sub       %1,%2                             \n"
    LABELALIGN
    "1:                                        \n"
    // xmm2 = 8 U bytes, xmm3 = 8 V bytes.
    "movq      " MEMACCESS(1) ",%%xmm2           \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    // xmm2 = U0 V0 U1 V1 ... U7 V7.
    "punpcklbw %%xmm3,%%xmm2                     \n"
    // xmm0 = xmm1 = 16 Y bytes.
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "movdqa    %%xmm0,%%xmm1                     \n"
    // Interleave Y with the UV pairs: low 8 Y into xmm0, high 8 Y into xmm1.
    "punpcklbw %%xmm2,%%xmm0                     \n"
    "punpckhbw %%xmm2,%%xmm1                     \n"
    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOYUY2ROW_SSE2
5318 
#ifdef HAS_I422TOUYVYROW_SSE2
// Packs one row of planar I422 (separate Y, U and V rows) into interleaved
// UYVY. The stores below emit bytes in the order U0 Y0 V0 Y1 U1 Y2 V1 Y3 ...
//
//   src_y     - luma row; 16 bytes are read per loop iteration.
//   src_u     - U row; 8 bytes are read per loop iteration.
//   src_v     - V row; 8 bytes are read per loop iteration.
//   dst_frame - output row; 32 bytes are written per loop iteration.
//   width     - pixel count. The counter drops by 16 per iteration and is
//               only tested after the stores, so the body runs at least once
//               and always processes a whole group of 16 pixels.
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame,
                        int width) {
  asm volatile (
    // Turn src_v into an offset from src_u so the single advancing pointer
    // %1 can address both chroma rows.
    "sub        %1,%2                            \n"
    LABELALIGN
    "1:                                        \n"
    // xmm2 = 8 U bytes, xmm3 = 8 V bytes.
    "movq      " MEMACCESS(1) ",%%xmm2           \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    // xmm2 = U0 V0 U1 V1 ... U7 V7.
    "punpcklbw %%xmm3,%%xmm2                     \n"
    // xmm0 = 16 Y bytes; xmm1 = copy of the UV pairs.
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    "movdqa    %%xmm2,%%xmm1                     \n"
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    // Chroma goes first here: UV pairs interleaved with Y, low half into
    // xmm1 and high half into xmm2.
    "punpcklbw %%xmm0,%%xmm1                     \n"
    "punpckhbw %%xmm0,%%xmm2                     \n"
    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOUYVYROW_SSE2
5354 
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a cubic polynomial to every byte of a row of 4-byte pixels.
//
// poly is read as four vectors of 4 floats at byte offsets 0x00, 0x10, 0x20
// and 0x30 (called C0..C3 here). Each vector holds one coefficient per byte
// of a pixel, and the value stored for an input byte X is
//   C0 + C1 * X + C2 * X * X + C3 * X * X * X
// converted back to an integer with truncation and packed to a byte.
//
//   src_argb - source row; 8 bytes (2 pixels) are read per loop iteration.
//   dst_argb - destination row; 8 bytes are written per loop iteration.
//   poly     - 16 floats. It is used directly as the memory operand of
//              mulps/addps, which requires 16-byte alignment.
//   width    - pixel count. The counter drops by 2 per iteration and is only
//              tested after the store, so the body runs at least once.
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile (
    // xmm3 = 0, used to zero-extend bytes and words.
    "pxor      %%xmm3,%%xmm3                   \n"

    // 2 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Load 2 pixels and widen bytes -> words -> dwords -> floats.
    // xmm0 = pixel 0 as 4 floats, xmm4 = pixel 1 as 4 floats.
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm3,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm4                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    // Keep X in xmm1/xmm5 for the higher powers.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    // xmm0/xmm4 = C0 + C1 * X
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
    "addps     " MEMACCESS(3) ",%%xmm0         \n"
    "addps     " MEMACCESS(3) ",%%xmm4         \n"
    // xmm2/xmm6 = X * X, then xmm1/xmm5 = X * X * X
    "movdqa    %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm1,%%xmm2                   \n"
    "mulps     %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm2,%%xmm1                   \n"
    "mulps     %%xmm6,%%xmm5                   \n"
    // Scale the square by C2 and the cube by C3.
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
    // Sum all four terms.
    "addps     %%xmm2,%%xmm0                   \n"
    "addps     %%xmm6,%%xmm4                   \n"
    "addps     %%xmm1,%%xmm0                   \n"
    "addps     %%xmm5,%%xmm4                   \n"
    // Truncate to integers, then narrow to bytes with two packuswb passes.
    "cvttps2dq %%xmm0,%%xmm0                   \n"
    "cvttps2dq %%xmm4,%%xmm4                   \n"
    "packuswb  %%xmm4,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x2,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
5411 
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// Applies a cubic polynomial to every byte of a row of 4-byte pixels, two
// pixels (8 bytes) per loop iteration.
//
// poly is read as four vectors of 4 floats at byte offsets 0x00, 0x10, 0x20
// and 0x30 (C0..C3). Each is broadcast to both 128-bit halves of a ymm
// register so the same per-byte coefficients apply to both pixels. The value
// stored for an input byte X is
//   C0 + C1 * X + C2 * X * X + C3 * X * X * X
// truncated to an integer and packed to a byte.
//
// Besides AVX2 this routine executes FMA instructions (vfmadd132ps and
// vfmadd231ps).
//
//   src_argb - source row; 8 bytes are read per loop iteration.
//   dst_argb - destination row; 8 bytes are written per loop iteration.
//   poly     - 16 floats (C0, C1, C2, C3).
//   width    - pixel count. The counter drops by 2 per iteration and is only
//              tested after the store, so the body runs at least once.
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile (
    // ymm4..ymm7 = C0..C3, each duplicated into both 128-bit lanes.
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"

    // 2 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
    "lea         " MEMLEA(0x8,0) ",%0          \n"
    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
    // Truncate to integers, then narrow dwords -> words -> bytes.
    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
    // vpackusdw packs within each 128-bit lane; gather the low quadword of
    // each lane into the low 128 bits before the final byte pack.
    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
    "lea         " MEMLEA(0x8,1) ",%1          \n"
    "sub         $0x2,%2                       \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
5452 
#ifdef HAS_HALFFLOATROW_SSE2
// 2^-112. Multiplying a float by this lowers its exponent by 112 (the gap
// between the float bias 127 and the half-float bias 15), so that shifting
// the float's bit pattern right by 13 (23 - 10 mantissa bits) leaves a
// half-float bit pattern in the low 16 bits.
static float kScaleBias = 1.9259299444e-34f;

// Converts a row of 16-bit unsigned integers to 16-bit half floats, scaling
// each value by `scale`.
//
//   src   - source row; 8 uint16 values (16 bytes) are read per iteration.
//   dst   - destination row; 8 half floats (16 bytes) are written per
//           iteration.
//   scale - multiplier applied to every value before conversion.
//   width - element count. The counter drops by 8 per iteration and is only
//           tested after the store, so the body runs at least once.
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    // xmm4 = scale * kScaleBias in all four lanes; xmm5 = 0.
    "pshufd      $0x0,%3,%%xmm4                \n"
    "pxor        %%xmm5,%%xmm5                 \n"
    // Turn dst into an offset from src so only %0 has to advance.
    "sub         %0,%1                         \n"

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu      " MEMACCESS(0) ",%%xmm2       \n"  // 8 shorts
    "add         $0x10,%0                      \n"
    "movdqa      %%xmm2,%%xmm3                 \n"
    "punpcklwd   %%xmm5,%%xmm2                 \n"  // 8 ints in xmm2/xmm3
    "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 8 floats
    "punpckhwd   %%xmm5,%%xmm3                 \n"
    "cvtdq2ps    %%xmm3,%%xmm3                 \n"
    "mulps       %%xmm4,%%xmm2                 \n"
    "mulps       %%xmm4,%%xmm3                 \n"
    // Drop the 13 low mantissa bits; the remaining low 16 bits are the
    // half-float encoding.
    "psrld       $0xd,%%xmm2                   \n"
    "psrld       $0xd,%%xmm3                   \n"
    "packssdw    %%xmm3,%%xmm2                 \n"
    // The -0x10 displacement compensates for %0 having already advanced.
    MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
    "sub         $0x8,%2                       \n"
    "jg          1b                            \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "x"(scale * kScaleBias)   // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_HALFFLOATROW_SSE2
5488 
#ifdef HAS_HALFFLOATROW_AVX2
// Converts a row of 16-bit unsigned integers to 16-bit half floats, scaling
// each value by `scale`. The half-float bit pattern is produced with integer
// shifts: values are multiplied by scale * kScaleBias and the resulting
// float bit patterns are shifted right by 13.
//
//   src   - source row; 16 uint16 values (32 bytes) are read per iteration.
//   dst   - destination row; 16 half floats (32 bytes) are written per
//           iteration.
//   scale - multiplier applied to every value before conversion.
//   width - element count. The counter drops by 16 per iteration and is only
//           tested after the store, so the body runs at least once.
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    // ymm4 = scale * kScaleBias in all eight lanes; ymm5 = 0.
    "vbroadcastss  %3, %%ymm4                  \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
    // Turn dst into an offset from src so only %0 has to advance.
    "sub        %0,%1                          \n"

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm2        \n"  // 16 shorts
    "add        $0x20,%0                       \n"
    // The unpacks work per 128-bit lane, so element order is shuffled
    // ("mutated") across ymm2/ymm3 until the per-lane pack below undoes it.
    "vpunpckhwd %%ymm5,%%ymm2,%%ymm3           \n"  // mutates
    "vpunpcklwd %%ymm5,%%ymm2,%%ymm2           \n"
    "vcvtdq2ps  %%ymm3,%%ymm3                  \n"
    "vcvtdq2ps  %%ymm2,%%ymm2                  \n"
    "vmulps     %%ymm3,%%ymm4,%%ymm3           \n"
    "vmulps     %%ymm2,%%ymm4,%%ymm2           \n"
    // Drop the 13 low mantissa bits; the remaining low 16 bits are the
    // half-float encoding.
    "vpsrld     $0xd,%%ymm3,%%ymm3             \n"
    "vpsrld     $0xd,%%ymm2,%%ymm2             \n"
    "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // unmutates
    // The -0x20 displacement compensates for %0 having already advanced.
    MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"

    "vzeroupper                                \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "x"(scale * kScaleBias)   // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_HALFFLOATROW_AVX2
5524 
#ifdef HAS_HALFFLOATROW_F16C
// Converts a row of 16-bit unsigned integers to 16-bit half floats using the
// hardware vcvtps2ph instruction, scaling each value by `scale` first.
//
//   src   - source row; 16 uint16 values (32 bytes) are read per iteration.
//   dst   - destination row; 16 half floats (32 bytes) are written per
//           iteration.
//   scale - multiplier applied to every value before conversion.
//   width - element count. The counter drops by 16 per iteration and is only
//           tested after the stores, so the body runs at least once.
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    // ymm4 = scale in all eight lanes.
    "vbroadcastss  %3, %%ymm4                  \n"
    // Turn dst into an offset from src so only %0 has to advance.
    "sub        %0,%1                          \n"

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vpmovzxwd   " MEMACCESS(0) ",%%ymm2       \n"  // 8 shorts -> 8 ints
    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm3 \n"  // next 8 shorts -> 8 ints
    "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
    "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
    "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
    "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
    // Immediate 3 selects round-toward-zero (truncation).
    "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
    "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
    // Stores are addressed through (%0,%1), i.e. src plus the dst offset.
    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
    "add         $0x20,%0                      \n"
    "sub         $0x10,%2                      \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "x"(scale)   // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_HALFFLOATROW_F16C
5557 
#ifdef HAS_HALFFLOATROW_F16C
// Converts a row of 16-bit unsigned integers to 16-bit half floats using the
// hardware vcvtps2ph instruction, with no scaling. The unnamed float
// parameter is accepted but never read, so this behaves like a scale of 1.
//
//   src   - source row; 16 uint16 values (32 bytes) are read per iteration.
//   dst   - destination row; 16 half floats (32 bytes) are written per
//           iteration.
//   width - element count. The counter drops by 16 per iteration and is only
//           tested after the stores, so the body runs at least once.
void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
  asm volatile (
    // Turn dst into an offset from src so only %0 has to advance.
    "sub        %0,%1                          \n"
    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vpmovzxwd   " MEMACCESS(0) ",%%ymm2       \n"  // 8 shorts -> 8 ints
    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm3 \n"  // next 8 shorts -> 8 ints
    "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
    "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
    // Immediate 3 selects round-toward-zero (truncation).
    "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
    "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
    // Stores are addressed through (%0,%1), i.e. src plus the dst offset.
    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
    "add         $0x20,%0                      \n"
    "sub         $0x10,%2                      \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc",
    "xmm2", "xmm3"
  );
}
#endif  // HAS_HALFFLOATROW_F16C
5586 
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
//
// Rewrites a row of 4-byte pixels in place. Byte c (0..3) of every pixel,
// with value v, is replaced by table_argb[v * 4 + c]; each channel therefore
// has its own 256-entry lookup column interleaved in the table.
//
//   dst_argb   - row to transform in place; 4 bytes per pixel.
//   table_argb - lookup table; offsets 0..1023 can be read.
//   width      - pixel count. The counter is decremented and tested after
//                the pixel has been processed, so the body runs at least
//                once.
void ARGBColorTableRow_X86(uint8* dst_argb,
                           const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Byte 0. The pointer is advanced right away, so the remaining accesses
    // to this pixel use negative displacements.
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    // Byte 1.
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    // Byte 2.
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    // Byte 3.
    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86
5619 
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
//
// Rewrites a row of 4-byte pixels in place. Byte c (0..2) of every pixel,
// with value v, is replaced by table_argb[v * 4 + c]. Byte 3 of each pixel
// is neither read nor written.
//
//   dst_argb   - row to transform in place; 4 bytes per pixel.
//   table_argb - lookup table with a 4-byte stride per entry; offsets
//                0..1022 can be read.
//   width      - pixel count. The counter is decremented and tested after
//                the pixel has been processed, so the body runs at least
//                once.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Byte 0. The pointer is advanced right away, so the remaining accesses
    // to this pixel use negative displacements.
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    // Byte 1.
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    // Byte 2.
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86
5647 
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
//
// For every 4-byte pixel, a weighted sum of its four bytes is formed with
// the four signed 8-bit weights packed in lumacoeff (pmaddubsw + phaddw).
// The high byte of that 16-bit sum, kept in place by the 0xff00 mask, is
// used as a byte offset (a multiple of 256) into luma, selecting a 256-byte
// sub-table. Bytes 0..2 of the pixel are then mapped through that sub-table
// and byte 3 is copied through unchanged.
//
//   src_argb  - source row; 16 bytes (4 pixels) are read per iteration.
//   dst_argb  - destination row; 16 bytes are written per iteration.
//   width     - pixel count. The counter drops by 4 per iteration and is
//               only tested after the stores, so the body runs at least
//               once.
//   luma      - base of the lookup tables.
//   lumacoeff - four packed 8-bit weights, one per pixel byte.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
                                 uint8* dst_argb,
                                 int width,
                                 const uint8* luma,
                                 uint32 lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile (
    // xmm3 = lumacoeff in every dword, xmm4 = 0xff00 in every word, xmm5 = 0.
    "movd      %6,%%xmm3                       \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0x8,%%xmm4                     \n"
    "pxor      %%xmm5,%%xmm5                   \n"

    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Compute the four per-pixel table offsets as dwords in xmm0.
    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
    "pmaddubsw %%xmm3,%%xmm0                   \n"
    "phaddw    %%xmm0,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm0                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"

    // Pixel 0: %1 = luma + offset; pshufd rotates the next offset into the
    // low dword.
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS(2) ",%0             \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS(3) "            \n"
    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
    // Byte 3 is copied without a table lookup.
    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x3,3) "       \n"

    // Pixel 1.
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x7,3) "       \n"

    // Pixel 2.
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xb,3) "       \n"

    // Pixel 3 (no rotate needed after the last offset).
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"

    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "lea       " MEMLEA(0x10,3) ",%3           \n"
    "sub       $0x4,%4                         \n"
    "jg        1b                              \n"
  : "=&d"(pixel_temp),  // %0
    "=&a"(table_temp),  // %1
    "+r"(src_argb),     // %2
    "+r"(dst_argb),     // %3
    "+rm"(width)        // %4
  : "r"(luma),          // %5
    "rm"(lumacoeff)     // %6
  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5749 
5750 #endif  // defined(__x86_64__) || defined(__i386__)
5751 
5752 #ifdef __cplusplus
5753 }  // extern "C"
5754 }  // namespace libyuv
5755 #endif
5756