/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
static vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
static vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
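
// Scalar reference for the coefficients above: an illustrative sketch added
// for documentation only; the *_Reference helpers are not libyuv APIs.
// ARGB is stored little-endian as B,G,R,A, so kARGBToY computes
// 13*B + 65*G + 33*R in 7 bit fixed point, then adds 16 (studio range),
// matching pmaddubsw + psrlw $7 + paddb in the row functions below.
static __inline uint8 ARGBToY_Reference(const uint8* argb) {
  return (uint8)(((13 * argb[0] + 65 * argb[1] + 33 * argb[2]) >> 7) + 16);
}

// JPeg full range variant: rounded (+64 is 0.5 in 7 bit), no +16 offset.
static __inline uint8 ARGBToYJ_Reference(const uint8* argb) {
  return (uint8)((15 * argb[0] + 75 * argb[1] + 38 * argb[2] + 64) >> 7);
}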

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// Constants for BGRA
static vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
static vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
static vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
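
// Scalar reference for the U/V coefficients above (illustrative sketch; the
// *_Reference helpers are not libyuv APIs). The row functions apply
// pmaddubsw, a signed >> 8 (psraw), then a +128 bias (paddb kAddUV128).
// Note: >> of a negative int is arithmetic on the GCC targets this builds for.
static __inline uint8 ARGBToU_Reference(int b, int g, int r) {
  return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static __inline uint8 ARGBToV_Reference(int b, int g, int r) {
  return (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}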

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting RAW to RGB24. First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting RAW to RGB24. Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting RAW to RGB24. Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};
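
// Model of pshufb as used with the tables above (illustrative sketch, not a
// libyuv API): output byte i is src[mask[i] & 15], and any index with the
// high bit set (the 128u entries) produces zero, which is how unused output
// lanes are cleared.
static __inline void Pshufb_Reference(const uint8* src, const uint8* mask,
                                      uint8* dst) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
  }
}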

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_J400TOARGBROW_SSE2
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
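
// Scalar equivalent of J400ToARGBRow_SSE2 (illustrative sketch, not a libyuv
// API): replicate each grey sample into B, G and R and set alpha to 0xff,
// which is what the punpcklbw/punpcklwd expansion plus the por with the
// 0xff000000 mask achieve above.
static __inline void J400ToARGBRow_Reference(const uint8* src_y,
                                             uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[4 * x + 0] = src_y[x];
    dst_argb[4 * x + 1] = src_y[x];
    dst_argb[4 * x + 2] = src_y[x];
    dst_argb[4 * x + 3] = 0xff;
  }
}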
#endif // HAS_J400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
  asm volatile (
    "movdqa %3,%%xmm3 \n"
    "movdqa %4,%%xmm4 \n"
    "movdqa %5,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n"
    "lea " MEMLEA(0x18,0) ",%0 \n"
    "pshufb %%xmm3,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
    "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_raw),    // %0
    "+r"(dst_rgb24),  // %1
    "+r"(width)       // %2
  : "m"(kShuffleMaskRAWToRGB24_0),  // %3
    "m"(kShuffleMaskRAWToRGB24_1),  // %4
    "m"(kShuffleMaskRAWToRGB24_2)   // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20802080,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
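
// Scalar model of the pmulhuw trick above (illustrative sketch, not a libyuv
// API). Multiplying a 5 bit field by 0x0108, or a 6 bit field by 0x2080, and
// keeping the high 16 bits replicates the top bits into the low bits: the
// usual (c << 3) | (c >> 2) style range expansion.
static __inline void RGB565ToARGB_Reference(uint16 rgb565, uint8* argb) {
  int b = rgb565 & 0x1f;
  int g = (rgb565 >> 5) & 0x3f;
  int r = (rgb565 >> 11) & 0x1f;
  argb[0] = (uint8)((b << 3) | (b >> 2));
  argb[1] = (uint8)((g << 2) | (g >> 4));
  argb[2] = (uint8)((r << 3) | (r >> 2));
  argb[3] = 0xff;
}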

void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
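
// Scalar model for ARGB4444 expansion (illustrative sketch, not a libyuv
// API): each 4 bit channel is replicated into both nibbles, i.e. c * 0x11,
// matching the pand/psllw/psrlw/por sequence above.
static __inline uint8 Expand4To8_Reference(uint8 c4) {
  return (uint8)(c4 * 0x11);  // 0x0 -> 0x00, 0x8 -> 0x88, 0xf -> 0xff.
}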

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
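
// Scalar equivalent of the RGB565 packing above (illustrative sketch, not a
// libyuv API): truncate each channel to 5/6/5 bits and merge, blue in the
// low bits.
static __inline uint16 ARGBToRGB565_Reference(const uint8* argb) {
  return (uint16)((argb[0] >> 3) | ((argb[1] >> 2) << 5) |
                  ((argb[2] >> 3) << 11));
}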

void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
                                const uint32 dither4, int width) {
  asm volatile (
    "movd %3,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm6 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "punpcklwd %%xmm6,%%xmm6 \n"
    "punpckhwd %%xmm7,%%xmm7 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"

    LABELALIGN
  "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "paddusb %%xmm6,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea 0x10(%0),%0 \n"
    "movq %%xmm0,(%1) \n"
    "lea 0x8(%1),%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "m"(dither4)  // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
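
// dither4 packs one 4x1 dither row as 4 bytes; the punpcklbw/punpcklwd setup
// above expands it so each pixel's B, G, R and A all receive that pixel's
// dither byte via a saturating add before truncation to 565. Illustrative
// scalar model (not a libyuv API):
static __inline uint8 AddDither_Reference(uint8 c, uint8 d) {
  int v = c + d;  // paddusb is a saturating unsigned add.
  return (uint8)(v > 255 ? 255 : v);
}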

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
                                const uint32 dither4, int width) {
  asm volatile (
    "vbroadcastss %3,%%xmm6 \n"
    "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
    "vpermq $0xd8,%%ymm6,%%ymm6 \n"
    "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
    "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
    "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
    "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
    "vpslld $0x5,%%ymm4,%%ymm4 \n"
    "vpslld $0xb,%%ymm3,%%ymm5 \n"

    LABELALIGN
  "1: \n"
    "vmovdqu (%0),%%ymm0 \n"
    "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
    "vpsrld $0x5,%%ymm0,%%ymm2 \n"
    "vpsrld $0x3,%%ymm0,%%ymm1 \n"
    "vpsrld $0x8,%%ymm0,%%ymm0 \n"
    "vpand %%ymm4,%%ymm2,%%ymm2 \n"
    "vpand %%ymm3,%%ymm1,%%ymm1 \n"
    "vpand %%ymm5,%%ymm0,%%ymm0 \n"
    "vpor %%ymm2,%%ymm1,%%ymm1 \n"
    "vpor %%ymm1,%%ymm0,%%ymm0 \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "lea 0x20(%0),%0 \n"
    "vmovdqu %%xmm0,(%1) \n"
    "lea 0x10(%1),%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "m"(dither4)  // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTORGB565DITHERROW_AVX2


void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :: "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xc,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm3 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pand %%xmm3,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "psrlq $0x4,%%xmm0 \n"
    "psrlq $0x8,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif // HAS_RGB24TOARGBROW_SSSE3

#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but with different coefficients; no +16 bias is added,
// and the result is rounded.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_ARGBTOYJROW_SSSE3

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd table to undo the lane mutation of vphaddw + vpackuswb.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vbroadcastf128 %4,%%ymm5 \n"
    "vmovdqu %5,%%ymm6 \n"
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
    "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
    "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vbroadcastf128 %4,%%ymm5 \n"
    "vmovdqu %5,%%ymm6 \n"
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
    "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
    "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
    "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
    "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
    "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64),   // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif // HAS_ARGBTOYJROW_AVX2

#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_argb)),  // %4
    "m"(kARGBToV),    // %5
    "m"(kARGBToU),    // %6
    "m"(kAddUV128)    // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTOUVROW_SSSE3
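
// The UV row functions subsample 2x2 before applying the coefficients: pavgb
// averages the two source rows, then the shufps 0x88/0xdd pair separates
// even and odd pixels so a second pavgb averages horizontal neighbours.
// Illustrative scalar model for one channel of one 2x2 block (not a libyuv
// API):
static __inline uint8 Average2x2_Reference(uint8 tl, uint8 tr,
                                           uint8 bl, uint8 br) {
  // pavgb rounds up: (x + y + 1) >> 1, applied vertically then horizontally.
  uint8 l = (uint8)((tl + bl + 1) >> 1);
  uint8 r = (uint8)((tr + br + 1) >> 1);
  return (uint8)((l + r + 1) >> 1);
}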

#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5 \n"
    "vbroadcastf128 %6,%%ymm6 \n"
    "vbroadcastf128 %7,%%ymm7 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
    "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
    "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
    "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
    "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
    "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
    "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpsraw $0x8,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm0,%%ymm0 \n"
    "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpshufb %8,%%ymm0,%%ymm0 \n"
    "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_argb)),  // %4
    "m"(kAddUV128),   // %5
    "m"(kARGBToV),    // %6
    "m"(kARGBToU),    // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTOUVROW_AVX2

#ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5 \n"
    "vbroadcastf128 %6,%%ymm6 \n"
    "vbroadcastf128 %7,%%ymm7 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
    "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea " MEMLEA(0x80,0) ",%0 \n"
    "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
    "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
    "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
    "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
    "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
    "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"

    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
    "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
    "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm1,%%ymm1 \n"
    "vpsraw $0x8,%%ymm0,%%ymm0 \n"
    "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpshufb %8,%%ymm0,%%ymm0 \n"

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x20,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_argb)),  // %4
    "m"(kAddUVJ128),  // %5
    "m"(kARGBToVJ),   // %6
    "m"(kARGBToUJ),   // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTOUVJROW_AVX2

#ifdef HAS_ARGBTOUVJROW_SSSE3
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "paddw %%xmm5,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_argb)),  // %4
    "m"(kARGBToVJ),   // %5
    "m"(kARGBToUJ),   // %6
    "m"(kAddUVJ128)   // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBTOUVJROW_SSSE3

#ifdef HAS_ARGBTOUV444ROW_SSSE3
void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int width) {
  asm volatile (
    "movdqa %4,%%xmm3 \n"
    "movdqa %5,%%xmm4 \n"
    "movdqa %6,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm6 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm2 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "packsswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm3,%%xmm0 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm2 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "packsswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+rm"(width)     // %3
  : "m"(kARGBToV),   // %4
    "m"(kARGBToU),   // %5
    "m"(kAddUV128)   // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6"
  );
}
#endif // HAS_ARGBTOUV444ROW_SSSE3

void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_bgra0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_bgra)),  // %4
    "m"(kBGRAToV),    // %5
    "m"(kBGRAToU),    // %6
    "m"(kAddUV128)    // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    "movdqa %4,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm4,%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "phaddw %%xmm3,%%xmm2 \n"
    "psrlw $0x7,%%xmm0 \n"
    "psrlw $0x7,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_abgr0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_abgr)),  // %4
    "m"(kABGRToV),    // %5
    "m"(kABGRToU),    // %6
    "m"(kAddUV128)    // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %5,%%xmm3 \n"
    "movdqa %6,%%xmm4 \n"
    "movdqa %7,%%xmm5 \n"
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"

    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_rgba0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"((intptr_t)(src_stride_rgba)),  // %4
    "m"(kRGBAToV),    // %5
    "m"(kRGBAToU),    // %6
    "m"(kAddUV128)    // %7
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
  );
}

#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

// Read 8 UV from 444
#define READYUV444 \
    "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "punpcklbw %%xmm4,%%xmm4 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "punpcklbw %%xmm4,%%xmm4 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "punpcklbw %%xmm4,%%xmm4 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
    "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \
    "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n"

// Read 2 UV from 411, upsample to 8 UV.
// reading 4 bytes is an msan violation.
// "movd " MEMACCESS([u_buf]) ",%%xmm0 \n"
// MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
// pinsrw fails with drmemory
// __asm pinsrw xmm0, [esi], 0 /* U */
// __asm pinsrw xmm1, [esi + edi], 0 /* V */
#define READYUV411_TEMP \
    "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \
    "movd %[temp],%%xmm0 \n" \
    MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \
    "movd %[temp],%%xmm1 \n" \
    "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "punpcklbw %%xmm4,%%xmm4 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 \
    "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
    "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "punpcklbw %%xmm4,%%xmm4 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 VU from NV21, upsample to 8 UV
#define READNV21 \
    "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
    "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \
    "pshufb %[kShuffleNV21], %%xmm0 \n" \
    "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \
    "punpcklbw %%xmm4,%%xmm4 \n" \
    "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"

// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
#define READYUY2 \
    "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \
    "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
    "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \
    "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \
    "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n"

// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
#define READUYVY \
    "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \
    "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
    "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \
    "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \
    "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n"

#if defined(__x86_64__)
#define YUVTORGB_SETUP(yuvconstants) \
    "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \
    "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \
    "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \
    "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \
    "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \
    "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \
    "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n"
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB(yuvconstants) \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "movdqa %%xmm0,%%xmm3 \n" \
    "movdqa %%xmm11,%%xmm0 \n" \
    "pmaddubsw %%xmm8,%%xmm1 \n" \
    "psubw %%xmm1,%%xmm0 \n" \
    "movdqa %%xmm12,%%xmm1 \n" \
    "pmaddubsw %%xmm9,%%xmm2 \n" \
    "psubw %%xmm2,%%xmm1 \n" \
    "movdqa %%xmm13,%%xmm2 \n" \
    "pmaddubsw %%xmm10,%%xmm3 \n" \
    "psubw %%xmm3,%%xmm2 \n" \
    "pmulhuw %%xmm14,%%xmm4 \n" \
    "paddsw %%xmm4,%%xmm0 \n" \
    "paddsw %%xmm4,%%xmm1 \n" \
    "paddsw %%xmm4,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"
#define YUVTORGB_REGS \
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB(yuvconstants) \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "movdqa %%xmm0,%%xmm3 \n" \
    "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \
    "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \
    "psubw %%xmm1,%%xmm0 \n" \
    "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \
    "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \
    "psubw %%xmm2,%%xmm1 \n" \
    "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \
    "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \
    "psubw %%xmm3,%%xmm2 \n" \
    "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \
    "paddsw %%xmm4,%%xmm0 \n" \
    "paddsw %%xmm4,%%xmm1 \n" \
    "paddsw %%xmm4,%%xmm2 \n" \
    "psraw $0x6,%%xmm0 \n" \
    "psraw $0x6,%%xmm1 \n" \
    "psraw $0x6,%%xmm2 \n" \
    "packuswb %%xmm0,%%xmm0 \n" \
    "packuswb %%xmm1,%%xmm1 \n" \
    "packuswb %%xmm2,%%xmm2 \n"
#define YUVTORGB_REGS
#endif
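
// Scalar model of YUVTORGB (illustrative sketch). The coefficients and biases
// live in struct YuvConstants; the parameter names below (ub, vb, bias_b, yg)
// are stand-ins for those fields, not libyuv identifiers. Each channel is
// bias minus a U,V dot product plus scaled Y, then >> 6 and clamped; blue
// shown here:
static __inline uint8 YUVToB_Reference(int y, int u, int v,
                                       int ub, int vb, int bias_b, int yg) {
  int y1 = ((y * 0x0101) * yg) >> 16;  // pmulhuw on punpcklbw'd Y.
  int b = (bias_b - (u * ub + v * vb) + y1) >> 6;
  return (uint8)(b < 0 ? 0 : (b > 255 ? 255 : b));
}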

// Store 8 ARGB values.
#define STOREARGB \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklbw %%xmm5,%%xmm2 \n" \
    "movdqa %%xmm0,%%xmm1 \n" \
    "punpcklwd %%xmm2,%%xmm0 \n" \
    "punpckhwd %%xmm2,%%xmm1 \n" \
    "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
    "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
    "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"

// Store 8 RGBA values.
#define STORERGBA \
    "pcmpeqb %%xmm5,%%xmm5 \n" \
    "punpcklbw %%xmm2,%%xmm1 \n" \
    "punpcklbw %%xmm0,%%xmm5 \n" \
    "movdqa %%xmm5,%%xmm0 \n" \
    "punpcklwd %%xmm1,%%xmm5 \n" \
    "punpckhwd %%xmm1,%%xmm0 \n" \
    "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
    "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \
    "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n"

void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    LABELALIGN
  "1: \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [u_buf]"+r"(u_buf),        // %[u_buf]
    [v_buf]"+r"(v_buf),        // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1698
1699 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1700 const uint8* u_buf,
1701 const uint8* v_buf,
1702 uint8* dst_rgb24,
1703 const struct YuvConstants* yuvconstants,
1704 int width) {
1705 asm volatile (
1706 YUVTORGB_SETUP(yuvconstants)
1707 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1708 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
1709 "sub %[u_buf],%[v_buf] \n"
1710 LABELALIGN
1711 "1: \n"
1712 READYUV422
1713 YUVTORGB(yuvconstants)
1714 "punpcklbw %%xmm1,%%xmm0 \n"
1715 "punpcklbw %%xmm2,%%xmm2 \n"
1716 "movdqa %%xmm0,%%xmm1 \n"
1717 "punpcklwd %%xmm2,%%xmm0 \n"
1718 "punpckhwd %%xmm2,%%xmm1 \n"
1719 "pshufb %%xmm5,%%xmm0 \n"
1720 "pshufb %%xmm6,%%xmm1 \n"
1721 "palignr $0xc,%%xmm0,%%xmm1 \n"
1722 "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
1723 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
1724 "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
1725 "subl $0x8,%[width] \n"
1726 "jg 1b \n"
1727 : [y_buf]"+r"(y_buf), // %[y_buf]
1728 [u_buf]"+r"(u_buf), // %[u_buf]
1729 [v_buf]"+r"(v_buf), // %[v_buf]
1730 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
1731 #if defined(__i386__) && defined(__pic__)
1732 [width]"+m"(width) // %[width]
1733 #else
1734 [width]"+rm"(width) // %[width]
1735 #endif
1736 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1737 [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1738 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
1739 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1740 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1741 );
1742 }
1743
1744 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
1745 const uint8* u_buf,
1746 const uint8* v_buf,
1747 uint8* dst_argb,
1748 const struct YuvConstants* yuvconstants,
1749 int width) {
1750 asm volatile (
1751 YUVTORGB_SETUP(yuvconstants)
1752 "sub %[u_buf],%[v_buf] \n"
1753 "pcmpeqb %%xmm5,%%xmm5 \n"
1754 LABELALIGN
1755 "1: \n"
1756 READYUV422
1757 YUVTORGB(yuvconstants)
1758 STOREARGB
1759 "sub $0x8,%[width] \n"
1760 "jg 1b \n"
1761 : [y_buf]"+r"(y_buf), // %[y_buf]
1762 [u_buf]"+r"(u_buf), // %[u_buf]
1763 [v_buf]"+r"(v_buf), // %[v_buf]
1764 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1765 [width]"+rm"(width) // %[width]
1766 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1767 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1768 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1769 );
1770 }
1771
1772 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
1773 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
1774 const uint8* u_buf,
1775 const uint8* v_buf,
1776 const uint8* a_buf,
1777 uint8* dst_argb,
1778 const struct YuvConstants* yuvconstants,
1779 int width) {
1780 asm volatile (
1781 YUVTORGB_SETUP(yuvconstants)
1782 "sub %[u_buf],%[v_buf] \n"
1783 LABELALIGN
1784 "1: \n"
1785 READYUVA422
1786 YUVTORGB(yuvconstants)
1787 STOREARGB
1788 "subl $0x8,%[width] \n"
1789 "jg 1b \n"
1790 : [y_buf]"+r"(y_buf), // %[y_buf]
1791 [u_buf]"+r"(u_buf), // %[u_buf]
1792 [v_buf]"+r"(v_buf), // %[v_buf]
1793 [a_buf]"+r"(a_buf), // %[a_buf]
1794 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1795 #if defined(__i386__) && defined(__pic__)
1796 [width]"+m"(width) // %[width]
1797 #else
1798 [width]"+rm"(width) // %[width]
1799 #endif
1800 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1801 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1802 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1803 );
1804 }
1805 #endif // HAS_I422ALPHATOARGBROW_SSSE3
1806
1807 #ifdef HAS_I411TOARGBROW_SSSE3
1808 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1809 const uint8* u_buf,
1810 const uint8* v_buf,
1811 uint8* dst_argb,
1812 const struct YuvConstants* yuvconstants,
1813 int width) {
1814 int temp;
1815 asm volatile (
1816 YUVTORGB_SETUP(yuvconstants)
1817 "sub %[u_buf],%[v_buf] \n"
1818 "pcmpeqb %%xmm5,%%xmm5 \n"
1819 LABELALIGN
1820 "1: \n"
1821 READYUV411_TEMP
1822 YUVTORGB(yuvconstants)
1823 STOREARGB
1824 "subl $0x8,%[width] \n"
1825 "jg 1b \n"
1826 : [y_buf]"+r"(y_buf), // %[y_buf]
1827 [u_buf]"+r"(u_buf), // %[u_buf]
1828 [v_buf]"+r"(v_buf), // %[v_buf]
1829 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1830 [temp]"=&r"(temp), // %[temp]
1831 #if defined(__i386__) && defined(__pic__)
1832 [width]"+m"(width) // %[width]
1833 #else
1834 [width]"+rm"(width) // %[width]
1835 #endif
1836 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1837 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1838 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1839 );
1840 }
1841 #endif
1842
1843 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1844 const uint8* uv_buf,
1845 uint8* dst_argb,
1846 const struct YuvConstants* yuvconstants,
1847 int width) {
1848 asm volatile (
1849 YUVTORGB_SETUP(yuvconstants)
1850 "pcmpeqb %%xmm5,%%xmm5 \n"
1851 LABELALIGN
1852 "1: \n"
1853 READNV12
1854 YUVTORGB(yuvconstants)
1855 STOREARGB
1856 "sub $0x8,%[width] \n"
1857 "jg 1b \n"
1858 : [y_buf]"+r"(y_buf), // %[y_buf]
1859 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1860 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1861 [width]"+rm"(width) // %[width]
1862 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1863 : "memory", "cc", YUVTORGB_REGS // Does not use r14.
1864 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1865 );
1866 }
1867
1868 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1869 const uint8* vu_buf,
1870 uint8* dst_argb,
1871 const struct YuvConstants* yuvconstants,
1872 int width) {
1873 asm volatile (
1874 YUVTORGB_SETUP(yuvconstants)
1875 "pcmpeqb %%xmm5,%%xmm5 \n"
1876 LABELALIGN
1877 "1: \n"
1878 READNV21
1879 YUVTORGB(yuvconstants)
1880 STOREARGB
1881 "sub $0x8,%[width] \n"
1882 "jg 1b \n"
1883 : [y_buf]"+r"(y_buf), // %[y_buf]
1884 [vu_buf]"+r"(vu_buf), // %[vu_buf]
1885 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1886 [width]"+rm"(width) // %[width]
1887 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1888 [kShuffleNV21]"m"(kShuffleNV21)
1889 : "memory", "cc", YUVTORGB_REGS // Does not use r14.
1890 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1891 );
1892 }
1893
1894 void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
1895 uint8* dst_argb,
1896 const struct YuvConstants* yuvconstants,
1897 int width) {
1898 asm volatile (
1899 YUVTORGB_SETUP(yuvconstants)
1900 "pcmpeqb %%xmm5,%%xmm5 \n"
1901 LABELALIGN
1902 "1: \n"
1903 READYUY2
1904 YUVTORGB(yuvconstants)
1905 STOREARGB
1906 "sub $0x8,%[width] \n"
1907 "jg 1b \n"
1908 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
1909 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1910 [width]"+rm"(width) // %[width]
1911 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1912 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
1913 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
1914 : "memory", "cc", YUVTORGB_REGS // Does not use r14.
1915 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1916 );
1917 }
1918
1919 void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
1920 uint8* dst_argb,
1921 const struct YuvConstants* yuvconstants,
1922 int width) {
1923 asm volatile (
1924 YUVTORGB_SETUP(yuvconstants)
1925 "pcmpeqb %%xmm5,%%xmm5 \n"
1926 LABELALIGN
1927 "1: \n"
1928 READUYVY
1929 YUVTORGB(yuvconstants)
1930 STOREARGB
1931 "sub $0x8,%[width] \n"
1932 "jg 1b \n"
1933 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
1934 [dst_argb]"+r"(dst_argb), // %[dst_argb]
1935 [width]"+rm"(width) // %[width]
1936 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1937 [kShuffleUYVYY]"m"(kShuffleUYVYY),
1938 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
1939 : "memory", "cc", YUVTORGB_REGS // Does not use r14.
1940 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1941 );
1942 }
1943
1944 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
1945 const uint8* u_buf,
1946 const uint8* v_buf,
1947 uint8* dst_rgba,
1948 const struct YuvConstants* yuvconstants,
1949 int width) {
1950 asm volatile (
1951 YUVTORGB_SETUP(yuvconstants)
1952 "sub %[u_buf],%[v_buf] \n"
1953 "pcmpeqb %%xmm5,%%xmm5 \n"
1954 LABELALIGN
1955 "1: \n"
1956 READYUV422
1957 YUVTORGB(yuvconstants)
1958 STORERGBA
1959 "sub $0x8,%[width] \n"
1960 "jg 1b \n"
1961 : [y_buf]"+r"(y_buf), // %[y_buf]
1962 [u_buf]"+r"(u_buf), // %[u_buf]
1963 [v_buf]"+r"(v_buf), // %[v_buf]
1964 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
1965 [width]"+rm"(width) // %[width]
1966 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
1967 : "memory", "cc", NACL_R14 YUVTORGB_REGS
1968 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1969 );
1970 }
1971
1972 #endif // HAS_I422TOARGBROW_SSSE3
1973
1974 // Read 16 UV from 444
1975 #define READYUV444_AVX2 \
1976 "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1977 MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
1978 "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
1979 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1980 "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
1981 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1982 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1983 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
1984 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
1985 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
1986
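// Note: AVX2 punpck* instructions interleave within each 128-bit lane, so
// the readers above and below use vpermq $0xd8 (64-bit lane order 0,2,1,3)
// to restore linear sample order around the in-lane shuffles.
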
1987 // Read 8 UV from 422, upsample to 16 UV.
1988 #define READYUV422_AVX2 \
1989 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
1990 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
1991 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
1992 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
1993 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
1994 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
1995 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
1996 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
1997 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
1998 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
1999
2000 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
2001 #define READYUVA422_AVX2 \
2002 "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
2003 MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
2004 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
2005 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
2006 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2007 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
2008 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
2009 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2010 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2011 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
2012 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
2013 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
2014 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"
2015
2016 // Read 4 UV from 411, upsample to 16 UV.
2017 #define READYUV411_AVX2 \
2018 "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
2019 MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \
2020 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
2021 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
2022 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
2023 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2024 "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \
2025 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
2026 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2027 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2028 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
2029
2030 // Read 8 UV from NV12, upsample to 16 UV.
2031 #define READNV12_AVX2 \
2032 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
2033 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
2034 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2035 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
2036 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
2037 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2038 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2039 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
2040
2041 // Read 8 VU from NV21, upsample to 16 UV.
2042 #define READNV21_AVX2 \
2043 "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
2044 "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
2045 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2046 "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
2047 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
2048 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
2049 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
2050 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
2051
2052 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
2053 #define READYUY2_AVX2 \
2054 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
2055 "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
2056 "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
2057 "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
2058 "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
2059
2060 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
2061 #define READUYVY_AVX2 \
2062 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
2063 "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
2064 "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
2065 "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
2066 "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
2067
2068 #if defined(__x86_64__)
2069 #define YUVTORGB_SETUP_AVX2(yuvconstants) \
2070 "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
2071 "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
2072 "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
2073 "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
2074 "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
2075 "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
2076 "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"
2077 #define YUVTORGB_AVX2(yuvconstants) \
2078 "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
2079 "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
2080 "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
2081 "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
2082 "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
2083 "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
2084 "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
2085 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
2086 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
2087 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
2088 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
2089 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
2090 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
2091 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
2092 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
2093 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
2094 #define YUVTORGB_REGS_AVX2 \
2095 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
2096 #else // Convert 16 pixels: 16 UV and 16 Y.
2097 #define YUVTORGB_SETUP_AVX2(yuvconstants)
2098 #define YUVTORGB_AVX2(yuvconstants) \
2099 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
2100 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
2101 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
2102 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
2103 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
2104 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \
2105 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
2106 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \
2107 "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
2108 "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \
2109 "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
2110 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
2111 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
2112 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
2113 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
2114 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
2115 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
2116 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
2117 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
2118 #define YUVTORGB_REGS_AVX2
2119 #endif
2120
2121 // Store 16 ARGB values.
2122 #define STOREARGB_AVX2 \
2123 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
2124 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
2125 "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
2126 "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
2127 "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
2128 "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
2129 "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
2130 "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \
2131 "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
2132
2133 #ifdef HAS_I444TOARGBROW_AVX2
2134 // 16 pixels
2135 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2136 void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
2137 const uint8* u_buf,
2138 const uint8* v_buf,
2139 uint8* dst_argb,
2140 const struct YuvConstants* yuvconstants,
2141 int width) {
2142 asm volatile (
2143 YUVTORGB_SETUP_AVX2(yuvconstants)
2144 "sub %[u_buf],%[v_buf] \n"
2145 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2146 LABELALIGN
2147 "1: \n"
2148 READYUV444_AVX2
2149 YUVTORGB_AVX2(yuvconstants)
2150 STOREARGB_AVX2
2151 "sub $0x10,%[width] \n"
2152 "jg 1b \n"
2153 "vzeroupper \n"
2154 : [y_buf]"+r"(y_buf), // %[y_buf]
2155 [u_buf]"+r"(u_buf), // %[u_buf]
2156 [v_buf]"+r"(v_buf), // %[v_buf]
2157 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2158 [width]"+rm"(width) // %[width]
2159 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2160 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2161 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2162 );
2163 }
2164 #endif // HAS_I444TOARGBROW_AVX2
2165
2166 #ifdef HAS_I411TOARGBROW_AVX2
2167 // 16 pixels
2168 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2169 void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
2170 const uint8* u_buf,
2171 const uint8* v_buf,
2172 uint8* dst_argb,
2173 const struct YuvConstants* yuvconstants,
2174 int width) {
2175 asm volatile (
2176 YUVTORGB_SETUP_AVX2(yuvconstants)
2177 "sub %[u_buf],%[v_buf] \n"
2178 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2179 LABELALIGN
2180 "1: \n"
2181 READYUV411_AVX2
2182 YUVTORGB_AVX2(yuvconstants)
2183 STOREARGB_AVX2
2184 "sub $0x10,%[width] \n"
2185 "jg 1b \n"
2186 "vzeroupper \n"
2187 : [y_buf]"+r"(y_buf), // %[y_buf]
2188 [u_buf]"+r"(u_buf), // %[u_buf]
2189 [v_buf]"+r"(v_buf), // %[v_buf]
2190 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2191 [width]"+rm"(width) // %[width]
2192 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2193 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2194 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2195 );
2196 }
2197 #endif // HAS_I411TOARGBROW_AVX2
2198
2199 #if defined(HAS_I422TOARGBROW_AVX2)
2200 // 16 pixels
2201 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2202 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
2203 const uint8* u_buf,
2204 const uint8* v_buf,
2205 uint8* dst_argb,
2206 const struct YuvConstants* yuvconstants,
2207 int width) {
2208 asm volatile (
2209 YUVTORGB_SETUP_AVX2(yuvconstants)
2210 "sub %[u_buf],%[v_buf] \n"
2211 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2212 LABELALIGN
2213 "1: \n"
2214 READYUV422_AVX2
2215 YUVTORGB_AVX2(yuvconstants)
2216 STOREARGB_AVX2
2217 "sub $0x10,%[width] \n"
2218 "jg 1b \n"
2219 "vzeroupper \n"
2220 : [y_buf]"+r"(y_buf), // %[y_buf]
2221 [u_buf]"+r"(u_buf), // %[u_buf]
2222 [v_buf]"+r"(v_buf), // %[v_buf]
2223 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2224 [width]"+rm"(width) // %[width]
2225 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2226 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2227 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2228 );
2229 }
2230 #endif // HAS_I422TOARGBROW_AVX2
2231
2232 #if defined(HAS_I422ALPHATOARGBROW_AVX2)
2233 // 16 pixels
2234 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2235 void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
2236 const uint8* u_buf,
2237 const uint8* v_buf,
2238 const uint8* a_buf,
2239 uint8* dst_argb,
2240 const struct YuvConstants* yuvconstants,
2241 int width) {
2242 asm volatile (
2243 YUVTORGB_SETUP_AVX2(yuvconstants)
2244 "sub %[u_buf],%[v_buf] \n"
2245 LABELALIGN
2246 "1: \n"
2247 READYUVA422_AVX2
2248 YUVTORGB_AVX2(yuvconstants)
2249 STOREARGB_AVX2
2250 "subl $0x10,%[width] \n"
2251 "jg 1b \n"
2252 "vzeroupper \n"
2253 : [y_buf]"+r"(y_buf), // %[y_buf]
2254 [u_buf]"+r"(u_buf), // %[u_buf]
2255 [v_buf]"+r"(v_buf), // %[v_buf]
2256 [a_buf]"+r"(a_buf), // %[a_buf]
2257 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2258 #if defined(__i386__) && defined(__pic__)
2259 [width]"+m"(width) // %[width]
2260 #else
2261 [width]"+rm"(width) // %[width]
2262 #endif
2263 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2264 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2265 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2266 );
2267 }
2268 #endif // HAS_I422ALPHATOARGBROW_AVX2
2269
2270 #if defined(HAS_I422TORGBAROW_AVX2)
2271 // 16 pixels
2272 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
2273 void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
2274 const uint8* u_buf,
2275 const uint8* v_buf,
2276 uint8* dst_argb,
2277 const struct YuvConstants* yuvconstants,
2278 int width) {
2279 asm volatile (
2280 YUVTORGB_SETUP_AVX2(yuvconstants)
2281 "sub %[u_buf],%[v_buf] \n"
2282 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2283 LABELALIGN
2284 "1: \n"
2285 READYUV422_AVX2
2286 YUVTORGB_AVX2(yuvconstants)
2287
2288 // Step 3: Weave into RGBA
2289 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
2290 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2291 "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
2292 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2293 "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
2294 "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
2295 "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
2296 "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2297 "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
2298 "sub $0x10,%[width] \n"
2299 "jg 1b \n"
2300 "vzeroupper \n"
2301 : [y_buf]"+r"(y_buf), // %[y_buf]
2302 [u_buf]"+r"(u_buf), // %[u_buf]
2303 [v_buf]"+r"(v_buf), // %[v_buf]
2304 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2305 [width]"+rm"(width) // %[width]
2306 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2307 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2308 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2309 );
2310 }
2311 #endif // HAS_I422TORGBAROW_AVX2
2312
2313 #if defined(HAS_NV12TOARGBROW_AVX2)
2314 // 16 pixels.
2315 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2316 void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
2317 const uint8* uv_buf,
2318 uint8* dst_argb,
2319 const struct YuvConstants* yuvconstants,
2320 int width) {
2321 asm volatile (
2322 YUVTORGB_SETUP_AVX2(yuvconstants)
2323 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2324 LABELALIGN
2325 "1: \n"
2326 READNV12_AVX2
2327 YUVTORGB_AVX2(yuvconstants)
2328 STOREARGB_AVX2
2329 "sub $0x10,%[width] \n"
2330 "jg 1b \n"
2331 "vzeroupper \n"
2332 : [y_buf]"+r"(y_buf), // %[y_buf]
2333 [uv_buf]"+r"(uv_buf), // %[uv_buf]
2334 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2335 [width]"+rm"(width) // %[width]
2336 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
2337 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2338 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2339 );
2340 }
2341 #endif // HAS_NV12TOARGBROW_AVX2
2342
2343 #if defined(HAS_NV21TOARGBROW_AVX2)
2344 // 16 pixels.
2345 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2346 void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
2347 const uint8* vu_buf,
2348 uint8* dst_argb,
2349 const struct YuvConstants* yuvconstants,
2350 int width) {
2351 asm volatile (
2352 YUVTORGB_SETUP_AVX2(yuvconstants)
2353 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2354 LABELALIGN
2355 "1: \n"
2356 READNV21_AVX2
2357 YUVTORGB_AVX2(yuvconstants)
2358 STOREARGB_AVX2
2359 "sub $0x10,%[width] \n"
2360 "jg 1b \n"
2361 "vzeroupper \n"
2362 : [y_buf]"+r"(y_buf), // %[y_buf]
2363 [vu_buf]"+r"(vu_buf), // %[vu_buf]
2364 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2365 [width]"+rm"(width) // %[width]
2366 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2367 [kShuffleNV21]"m"(kShuffleNV21)
2368 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2369 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2370 );
2371 }
2372 #endif // HAS_NV21TOARGBROW_AVX2
2373
2374 #if defined(HAS_YUY2TOARGBROW_AVX2)
2375 // 16 pixels.
2376 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2377 void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
2378 uint8* dst_argb,
2379 const struct YuvConstants* yuvconstants,
2380 int width) {
2381 asm volatile (
2382 YUVTORGB_SETUP_AVX2(yuvconstants)
2383 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2384 LABELALIGN
2385 "1: \n"
2386 READYUY2_AVX2
2387 YUVTORGB_AVX2(yuvconstants)
2388 STOREARGB_AVX2
2389 "sub $0x10,%[width] \n"
2390 "jg 1b \n"
2391 "vzeroupper \n"
2392 : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf]
2393 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2394 [width]"+rm"(width) // %[width]
2395 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2396 [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2397 [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2398 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2399 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2400 );
2401 }
2402 #endif // HAS_YUY2TOARGBROW_AVX2
2403
2404 #if defined(HAS_UYVYTOARGBROW_AVX2)
2405 // 16 pixels.
2406 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2407 void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
2408 uint8* dst_argb,
2409 const struct YuvConstants* yuvconstants,
2410 int width) {
2411 asm volatile (
2412 YUVTORGB_SETUP_AVX2(yuvconstants)
2413 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2414 LABELALIGN
2415 "1: \n"
2416 READUYVY_AVX2
2417 YUVTORGB_AVX2(yuvconstants)
2418 STOREARGB_AVX2
2419 "sub $0x10,%[width] \n"
2420 "jg 1b \n"
2421 "vzeroupper \n"
2422 : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf]
2423 [dst_argb]"+r"(dst_argb), // %[dst_argb]
2424 [width]"+rm"(width) // %[width]
2425 : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2426 [kShuffleUYVYY]"m"(kShuffleUYVYY),
2427 [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
2428 : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
2429 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2430 );
2431 }
2432 #endif // HAS_UYVYTOARGBROW_AVX2
2433
2434 #ifdef HAS_I400TOARGBROW_SSE2
2435 void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
2436 asm volatile (
2437 "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
2438 "movd %%eax,%%xmm2 \n"
2439 "pshufd $0x0,%%xmm2,%%xmm2 \n"
2440 "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
2441 "movd %%eax,%%xmm3 \n"
2442 "pshufd $0x0,%%xmm3,%%xmm3 \n"
2443 "pcmpeqb %%xmm4,%%xmm4 \n"
2444 "pslld $0x18,%%xmm4 \n"
2445 LABELALIGN
2446 "1: \n"
2447 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2448 "movq " MEMACCESS(0) ",%%xmm0 \n"
2449 "lea " MEMLEA(0x8,0) ",%0 \n"
2450 "punpcklbw %%xmm0,%%xmm0 \n"
2451 "pmulhuw %%xmm2,%%xmm0 \n"
2452 "psubusw %%xmm3,%%xmm0 \n"
2453 "psrlw $6, %%xmm0 \n"
2454 "packuswb %%xmm0,%%xmm0 \n"
2455
2456 // Step 2: Weave into ARGB
2457 "punpcklbw %%xmm0,%%xmm0 \n"
2458 "movdqa %%xmm0,%%xmm1 \n"
2459 "punpcklwd %%xmm0,%%xmm0 \n"
2460 "punpckhwd %%xmm1,%%xmm1 \n"
2461 "por %%xmm4,%%xmm0 \n"
2462 "por %%xmm4,%%xmm1 \n"
2463 "movdqu %%xmm0," MEMACCESS(1) " \n"
2464 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
2465 "lea " MEMLEA(0x20,1) ",%1 \n"
2466
2467 "sub $0x8,%2 \n"
2468 "jg 1b \n"
2469 : "+r"(y_buf), // %0
2470 "+r"(dst_argb), // %1
2471 "+rm"(width) // %2
2472 :
2473 : "memory", "cc", "eax"
2474 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2475 );
2476 }
2477 #endif // HAS_I400TOARGBROW_SSE2
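
// A minimal scalar sketch of Step 1 above, for one luma sample; the helper
// name is illustrative and the constants are the ones loaded into eax:
// 0x4a35 (18997, 1.164 as a 16-bit fixed-point factor for pmulhuw) and
// 0x0488 (1160 = 1.164 * 16, the bias removed by psubusw).
static __inline int I400ToGrayScalar(int y) {
  int g = ((y * 0x0101 * 0x4a35) >> 16) - 0x0488;  // pmulhuw on y|y<<8
  if (g < 0) g = 0;                                // psubusw clamps at 0
  g >>= 6;                                         // psrlw $6
  return g > 255 ? 255 : g;                        // packuswb saturates
}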
2478
2479 #ifdef HAS_I400TOARGBROW_AVX2
2480 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2481 // note: vpunpcklbw mutates the byte order within 128-bit lanes and vpackuswb unmutates it.
2482 void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
2483 asm volatile (
2484 "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
2485 "vmovd %%eax,%%xmm2 \n"
2486 "vbroadcastss %%xmm2,%%ymm2 \n"
2487 "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
2488 "vmovd %%eax,%%xmm3 \n"
2489 "vbroadcastss %%xmm3,%%ymm3 \n"
2490 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
2491 "vpslld $0x18,%%ymm4,%%ymm4 \n"
2492
2493 LABELALIGN
2494 "1: \n"
2495 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
2496 "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
2497 "lea " MEMLEA(0x10,0) ",%0 \n"
2498 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2499 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
2500 "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
2501 "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
2502 "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
2503 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
2504 "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
2505 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
2506 "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
2507 "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
2508 "vpor %%ymm4,%%ymm0,%%ymm0 \n"
2509 "vpor %%ymm4,%%ymm1,%%ymm1 \n"
2510 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2511 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
2512 "lea " MEMLEA(0x40,1) ",%1 \n"
2513 "sub $0x10,%2 \n"
2514 "jg 1b \n"
2515 "vzeroupper \n"
2516 : "+r"(y_buf), // %0
2517 "+r"(dst_argb), // %1
2518 "+rm"(width) // %2
2519 :
2520 : "memory", "cc", "eax"
2521 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2522 );
2523 }
2524 #endif // HAS_I400TOARGBROW_AVX2
2525
2526 #ifdef HAS_MIRRORROW_SSSE3
2527 // Shuffle table for reversing the bytes.
2528 static uvec8 kShuffleMirror = {
2529 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2530 };
2531
2532 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2533 intptr_t temp_width = (intptr_t)(width);
2534 asm volatile (
2535 "movdqa %3,%%xmm5 \n"
2536 LABELALIGN
2537 "1: \n"
2538 MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
2539 "pshufb %%xmm5,%%xmm0 \n"
2540 "movdqu %%xmm0," MEMACCESS(1) " \n"
2541 "lea " MEMLEA(0x10,1) ",%1 \n"
2542 "sub $0x10,%2 \n"
2543 "jg 1b \n"
2544 : "+r"(src), // %0
2545 "+r"(dst), // %1
2546 "+r"(temp_width) // %2
2547 : "m"(kShuffleMirror) // %3
2548 : "memory", "cc", NACL_R14
2549 "xmm0", "xmm5"
2550 );
2551 }
2552 #endif // HAS_MIRRORROW_SSSE3
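
// Scalar equivalent of the SSSE3/AVX2 mirror rows above and below
// (illustrative helper): reverse a row of bytes end to end.
static __inline void MirrorRowScalar(const uint8* src, uint8* dst,
                                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}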
2553
2554 #ifdef HAS_MIRRORROW_AVX2
2555 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2556 intptr_t temp_width = (intptr_t)(width);
2557 asm volatile (
2558 "vbroadcastf128 %3,%%ymm5 \n"
2559 LABELALIGN
2560 "1: \n"
2561 MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
2562 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
2563 "vpermq $0x4e,%%ymm0,%%ymm0 \n"
2564 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2565 "lea " MEMLEA(0x20,1) ",%1 \n"
2566 "sub $0x20,%2 \n"
2567 "jg 1b \n"
2568 "vzeroupper \n"
2569 : "+r"(src), // %0
2570 "+r"(dst), // %1
2571 "+r"(temp_width) // %2
2572 : "m"(kShuffleMirror) // %3
2573 : "memory", "cc", NACL_R14
2574 "xmm0", "xmm5"
2575 );
2576 }
2577 #endif // HAS_MIRRORROW_AVX2
2578
2579 #ifdef HAS_MIRRORUVROW_SSSE3
2580 // Shuffle table for reversing the bytes of UV channels.
2581 static uvec8 kShuffleMirrorUV = {
2582 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2583 };
2584 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2585 int width) {
2586 intptr_t temp_width = (intptr_t)(width);
2587 asm volatile (
2588 "movdqa %4,%%xmm1 \n"
2589 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
2590 "sub %1,%2 \n"
2591 LABELALIGN
2592 "1: \n"
2593 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2594 "lea " MEMLEA(-0x10,0) ",%0 \n"
2595 "pshufb %%xmm1,%%xmm0 \n"
2596 "movlpd %%xmm0," MEMACCESS(1) " \n"
2597 MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
2598 "lea " MEMLEA(0x8,1) ",%1 \n"
2599 "sub $8,%3 \n"
2600 "jg 1b \n"
2601 : "+r"(src), // %0
2602 "+r"(dst_u), // %1
2603 "+r"(dst_v), // %2
2604 "+r"(temp_width) // %3
2605 : "m"(kShuffleMirrorUV) // %4
2606 : "memory", "cc", NACL_R14
2607 "xmm0", "xmm1"
2608 );
2609 }
2610 #endif // HAS_MIRRORUVROW_SSSE3
2611
2612 #ifdef HAS_ARGBMIRRORROW_SSE2
2613
2614 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
2615 intptr_t temp_width = (intptr_t)(width);
2616 asm volatile (
2617 "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
2618 LABELALIGN
2619 "1: \n"
2620 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2621 "pshufd $0x1b,%%xmm0,%%xmm0 \n"
2622 "lea " MEMLEA(-0x10,0) ",%0 \n"
2623 "movdqu %%xmm0," MEMACCESS(1) " \n"
2624 "lea " MEMLEA(0x10,1) ",%1 \n"
2625 "sub $0x4,%2 \n"
2626 "jg 1b \n"
2627 : "+r"(src), // %0
2628 "+r"(dst), // %1
2629 "+r"(temp_width) // %2
2630 :
2631 : "memory", "cc"
2632 , "xmm0"
2633 );
2634 }
2635 #endif // HAS_ARGBMIRRORROW_SSE2
2636
2637 #ifdef HAS_ARGBMIRRORROW_AVX2
2638 // Shuffle table for reversing the bytes.
2639 static const ulvec32 kARGBShuffleMirror_AVX2 = {
2640 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
2641 };
2642 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
2643 intptr_t temp_width = (intptr_t)(width);
2644 asm volatile (
2645 "vmovdqu %3,%%ymm5 \n"
2646 LABELALIGN
2647 "1: \n"
2648 VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
2649 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2650 "lea " MEMLEA(0x20,1) ",%1 \n"
2651 "sub $0x8,%2 \n"
2652 "jg 1b \n"
2653 "vzeroupper \n"
2654 : "+r"(src), // %0
2655 "+r"(dst), // %1
2656 "+r"(temp_width) // %2
2657 : "m"(kARGBShuffleMirror_AVX2) // %3
2658 : "memory", "cc", NACL_R14
2659 "xmm0", "xmm5"
2660 );
2661 }
2662 #endif // HAS_ARGBMIRRORROW_AVX2
2663
2664 #ifdef HAS_SPLITUVROW_AVX2
2665 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2666 int width) {
2667 asm volatile (
2668 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
2669 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
2670 "sub %1,%2 \n"
2671 LABELALIGN
2672 "1: \n"
2673 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2674 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2675 "lea " MEMLEA(0x40,0) ",%0 \n"
2676 "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
2677 "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
2678 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
2679 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
2680 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
2681 "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
2682 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
2683 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
2684 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2685 MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
2686 "lea " MEMLEA(0x20,1) ",%1 \n"
2687 "sub $0x20,%3 \n"
2688 "jg 1b \n"
2689 "vzeroupper \n"
2690 : "+r"(src_uv), // %0
2691 "+r"(dst_u), // %1
2692 "+r"(dst_v), // %2
2693 "+r"(width) // %3
2694 :
2695 : "memory", "cc", NACL_R14
2696 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2697 );
2698 }
2699 #endif // HAS_SPLITUVROW_AVX2
2700
2701 #ifdef HAS_SPLITUVROW_SSE2
2702 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
2703 int width) {
2704 asm volatile (
2705 "pcmpeqb %%xmm5,%%xmm5 \n"
2706 "psrlw $0x8,%%xmm5 \n"
2707 "sub %1,%2 \n"
2708 LABELALIGN
2709 "1: \n"
2710 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2711 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2712 "lea " MEMLEA(0x20,0) ",%0 \n"
2713 "movdqa %%xmm0,%%xmm2 \n"
2714 "movdqa %%xmm1,%%xmm3 \n"
2715 "pand %%xmm5,%%xmm0 \n"
2716 "pand %%xmm5,%%xmm1 \n"
2717 "packuswb %%xmm1,%%xmm0 \n"
2718 "psrlw $0x8,%%xmm2 \n"
2719 "psrlw $0x8,%%xmm3 \n"
2720 "packuswb %%xmm3,%%xmm2 \n"
2721 "movdqu %%xmm0," MEMACCESS(1) " \n"
2722 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
2723 "lea " MEMLEA(0x10,1) ",%1 \n"
2724 "sub $0x10,%3 \n"
2725 "jg 1b \n"
2726 : "+r"(src_uv), // %0
2727 "+r"(dst_u), // %1
2728 "+r"(dst_v), // %2
2729 "+r"(width) // %3
2730 :
2731 : "memory", "cc", NACL_R14
2732 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2733 );
2734 }
2735 #endif // HAS_SPLITUVROW_SSE2
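
// Scalar equivalent of the SplitUV rows above (illustrative helper):
// deinterleave packed UV into separate U and V planes.
static __inline void SplitUVRowScalar(const uint8* src_uv, uint8* dst_u,
                                      uint8* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x];      // even bytes (pand with the 0x00ff mask)
    dst_v[x] = src_uv[2 * x + 1];  // odd bytes (psrlw $8)
  }
}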
2736
2737 #ifdef HAS_MERGEUVROW_AVX2
2738 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2739 int width) {
2740 asm volatile (
2741 "sub %0,%1 \n"
2742 LABELALIGN
2743 "1: \n"
2744 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2745 MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
2746 "lea " MEMLEA(0x20,0) ",%0 \n"
2747 "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
2748 "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
2749 "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
2750 "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
2751 "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
2752 "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
2753 "lea " MEMLEA(0x40,2) ",%2 \n"
2754 "sub $0x20,%3 \n"
2755 "jg 1b \n"
2756 "vzeroupper \n"
2757 : "+r"(src_u), // %0
2758 "+r"(src_v), // %1
2759 "+r"(dst_uv), // %2
2760 "+r"(width) // %3
2761 :
2762 : "memory", "cc", NACL_R14
2763 "xmm0", "xmm1", "xmm2"
2764 );
2765 }
2766 #endif // HAS_MERGEUVROW_AVX2
2767
2768 #ifdef HAS_MERGEUVROW_SSE2
2769 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
2770 int width) {
2771 asm volatile (
2772 "sub %0,%1 \n"
2773 LABELALIGN
2774 "1: \n"
2775 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2776 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
2777 "lea " MEMLEA(0x10,0) ",%0 \n"
2778 "movdqa %%xmm0,%%xmm2 \n"
2779 "punpcklbw %%xmm1,%%xmm0 \n"
2780 "punpckhbw %%xmm1,%%xmm2 \n"
2781 "movdqu %%xmm0," MEMACCESS(2) " \n"
2782 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
2783 "lea " MEMLEA(0x20,2) ",%2 \n"
2784 "sub $0x10,%3 \n"
2785 "jg 1b \n"
2786 : "+r"(src_u), // %0
2787 "+r"(src_v), // %1
2788 "+r"(dst_uv), // %2
2789 "+r"(width) // %3
2790 :
2791 : "memory", "cc", NACL_R14
2792 "xmm0", "xmm1", "xmm2"
2793 );
2794 }
2795 #endif // HAS_MERGEUVROW_SSE2
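
// Scalar equivalent of the MergeUV rows above (illustrative helper):
// interleave U and V planes back into packed UV, the inverse of SplitUV.
static __inline void MergeUVRowScalar(const uint8* src_u, const uint8* src_v,
                                      uint8* dst_uv, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[2 * x] = src_u[x];      // punpcklbw/punpckhbw interleave
    dst_uv[2 * x + 1] = src_v[x];
  }
}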
2796
2797 #ifdef HAS_COPYROW_SSE2
2798 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2799 asm volatile (
2800 "test $0xf,%0 \n"
2801 "jne 2f \n"
2802 "test $0xf,%1 \n"
2803 "jne 2f \n"
2804 LABELALIGN
2805 "1: \n"
2806 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
2807 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2808 "lea " MEMLEA(0x20,0) ",%0 \n"
2809 "movdqa %%xmm0," MEMACCESS(1) " \n"
2810 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
2811 "lea " MEMLEA(0x20,1) ",%1 \n"
2812 "sub $0x20,%2 \n"
2813 "jg 1b \n"
2814 "jmp 9f \n"
2815 LABELALIGN
2816 "2: \n"
2817 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
2818 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
2819 "lea " MEMLEA(0x20,0) ",%0 \n"
2820 "movdqu %%xmm0," MEMACCESS(1) " \n"
2821 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
2822 "lea " MEMLEA(0x20,1) ",%1 \n"
2823 "sub $0x20,%2 \n"
2824 "jg 2b \n"
2825 "9: \n"
2826 : "+r"(src), // %0
2827 "+r"(dst), // %1
2828 "+r"(count) // %2
2829 :
2830 : "memory", "cc"
2831 , "xmm0", "xmm1"
2832 );
2833 }
2834 #endif // HAS_COPYROW_SSE2
2835
2836 #ifdef HAS_COPYROW_AVX
2837 void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
2838 asm volatile (
2839 LABELALIGN
2840 "1: \n"
2841 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
2842 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
2843 "lea " MEMLEA(0x40,0) ",%0 \n"
2844 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2845 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
2846 "lea " MEMLEA(0x40,1) ",%1 \n"
2847 "sub $0x40,%2 \n"
2848 "jg 1b \n"
2849 : "+r"(src), // %0
2850 "+r"(dst), // %1
2851 "+r"(count) // %2
2852 :
2853 : "memory", "cc"
2854 , "xmm0", "xmm1"
2855 );
2856 }
2857 #endif // HAS_COPYROW_AVX
2858
2859 #ifdef HAS_COPYROW_ERMS
2860 // Copies any number of bytes (multiple of 1).
2861 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
2862 size_t width_tmp = (size_t)(width);
2863 asm volatile (
2864 "rep movsb " MEMMOVESTRING(0,1) " \n"
2865 : "+S"(src), // %0
2866 "+D"(dst), // %1
2867 "+c"(width_tmp) // %2
2868 :
2869 : "memory", "cc"
2870 );
2871 }
2872 #endif // HAS_COPYROW_ERMS
2873
2874 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
2875 // width in pixels
2876 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2877 asm volatile (
2878 "pcmpeqb %%xmm0,%%xmm0 \n"
2879 "pslld $0x18,%%xmm0 \n"
2880 "pcmpeqb %%xmm1,%%xmm1 \n"
2881 "psrld $0x8,%%xmm1 \n"
2882 LABELALIGN
2883 "1: \n"
2884 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
2885 "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
2886 "lea " MEMLEA(0x20,0) ",%0 \n"
2887 "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2888 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
2889 "pand %%xmm0,%%xmm2 \n"
2890 "pand %%xmm0,%%xmm3 \n"
2891 "pand %%xmm1,%%xmm4 \n"
2892 "pand %%xmm1,%%xmm5 \n"
2893 "por %%xmm4,%%xmm2 \n"
2894 "por %%xmm5,%%xmm3 \n"
2895 "movdqu %%xmm2," MEMACCESS(1) " \n"
2896 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2897 "lea " MEMLEA(0x20,1) ",%1 \n"
2898 "sub $0x8,%2 \n"
2899 "jg 1b \n"
2900 : "+r"(src), // %0
2901 "+r"(dst), // %1
2902 "+r"(width) // %2
2903 :
2904 : "memory", "cc"
2905 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2906 );
2907 }
2908 #endif // HAS_ARGBCOPYALPHAROW_SSE2
2909
2910 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
2911 // width in pixels
2912 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2913 asm volatile (
2914 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
2915 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
2916 LABELALIGN
2917 "1: \n"
2918 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
2919 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
2920 "lea " MEMLEA(0x40,0) ",%0 \n"
2921 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
2922 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
2923 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
2924 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
2925 "lea " MEMLEA(0x40,1) ",%1 \n"
2926 "sub $0x10,%2 \n"
2927 "jg 1b \n"
2928 "vzeroupper \n"
2929 : "+r"(src), // %0
2930 "+r"(dst), // %1
2931 "+r"(width) // %2
2932 :
2933 : "memory", "cc"
2934 , "xmm0", "xmm1", "xmm2"
2935 );
2936 }
2937 #endif // HAS_ARGBCOPYALPHAROW_AVX2
2938
2939 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
2940 // width in pixels
2941 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
2942 asm volatile (
2943 LABELALIGN
2944 "1: \n"
2945 "movdqu " MEMACCESS(0) ", %%xmm0 \n"
2946 "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
2947 "lea " MEMLEA(0x20, 0) ", %0 \n"
2948 "psrld $0x18, %%xmm0 \n"
2949 "psrld $0x18, %%xmm1 \n"
2950 "packssdw %%xmm1, %%xmm0 \n"
2951 "packuswb %%xmm0, %%xmm0 \n"
2952 "movq %%xmm0," MEMACCESS(1) " \n"
2953 "lea " MEMLEA(0x8, 1) ", %1 \n"
2954 "sub $0x8, %2 \n"
2955 "jg 1b \n"
2956 : "+r"(src_argb), // %0
2957 "+r"(dst_a), // %1
2958 "+rm"(width) // %2
2959 :
2960 : "memory", "cc"
2961 , "xmm0", "xmm1"
2962 );
2963 }
2964 #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
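
// Scalar equivalent of the alpha extraction above (illustrative helper).
// ARGB is stored little-endian as B,G,R,A, so psrld $0x18 isolates the
// alpha byte at offset 3 of each 4-byte pixel.
static __inline void ARGBExtractAlphaRowScalar(const uint8* src_argb,
                                               uint8* dst_a, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_a[x] = src_argb[4 * x + 3];
  }
}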
2965
2966 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
2967 // width in pixels
2968 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2969 asm volatile (
2970 "pcmpeqb %%xmm0,%%xmm0 \n"
2971 "pslld $0x18,%%xmm0 \n"
2972 "pcmpeqb %%xmm1,%%xmm1 \n"
2973 "psrld $0x8,%%xmm1 \n"
2974 LABELALIGN
2975 "1: \n"
2976 "movq " MEMACCESS(0) ",%%xmm2 \n"
2977 "lea " MEMLEA(0x8,0) ",%0 \n"
2978 "punpcklbw %%xmm2,%%xmm2 \n"
2979 "punpckhwd %%xmm2,%%xmm3 \n"
2980 "punpcklwd %%xmm2,%%xmm2 \n"
2981 "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2982 "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
2983 "pand %%xmm0,%%xmm2 \n"
2984 "pand %%xmm0,%%xmm3 \n"
2985 "pand %%xmm1,%%xmm4 \n"
2986 "pand %%xmm1,%%xmm5 \n"
2987 "por %%xmm4,%%xmm2 \n"
2988 "por %%xmm5,%%xmm3 \n"
2989 "movdqu %%xmm2," MEMACCESS(1) " \n"
2990 "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2991 "lea " MEMLEA(0x20,1) ",%1 \n"
2992 "sub $0x8,%2 \n"
2993 "jg 1b \n"
2994 : "+r"(src), // %0
2995 "+r"(dst), // %1
2996 "+r"(width) // %2
2997 :
2998 : "memory", "cc"
2999 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3000 );
3001 }
3002 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3003
3004 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3005 // width in pixels
3006 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3007 asm volatile (
3008 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
3009 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
3010 LABELALIGN
3011 "1: \n"
3012 "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
3013 "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
3014 "lea " MEMLEA(0x10,0) ",%0 \n"
3015 "vpslld $0x18,%%ymm1,%%ymm1 \n"
3016 "vpslld $0x18,%%ymm2,%%ymm2 \n"
3017 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
3018 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
3019 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
3020 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
3021 "lea " MEMLEA(0x40,1) ",%1 \n"
3022 "sub $0x10,%2 \n"
3023 "jg 1b \n"
3024 "vzeroupper \n"
3025 : "+r"(src), // %0
3026 "+r"(dst), // %1
3027 "+r"(width) // %2
3028 :
3029 : "memory", "cc"
3030 , "xmm0", "xmm1", "xmm2"
3031 );
3032 }
3033 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3034
3035 #ifdef HAS_SETROW_X86
3036 void SetRow_X86(uint8* dst, uint8 v8, int width) {
3037 size_t width_tmp = (size_t)(width >> 2);
3038 const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes.
3039 asm volatile (
3040 "rep stosl " MEMSTORESTRING(eax,0) " \n"
3041 : "+D"(dst), // %0
3042 "+c"(width_tmp) // %1
3043 : "a"(v32) // %2
3044 : "memory", "cc");
3045 }
3046
3047 void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
3048 size_t width_tmp = (size_t)(width);
3049 asm volatile (
3050 "rep stosb " MEMSTORESTRING(al,0) " \n"
3051 : "+D"(dst), // %0
3052 "+c"(width_tmp) // %1
3053 : "a"(v8) // %2
3054 : "memory", "cc");
3055 }
3056
3057 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
3058 size_t width_tmp = (size_t)(width);
3059 asm volatile (
3060 "rep stosl " MEMSTORESTRING(eax,0) " \n"
3061 : "+D"(dst_argb), // %0
3062 "+c"(width_tmp) // %1
3063 : "a"(v32) // %2
3064 : "memory", "cc");
3065 }
3066 #endif // HAS_SETROW_X86
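
// SetRow_X86 stores one dword per rep stosl iteration, so it fills
// width & ~3 bytes and expects widths that are a multiple of 4;
// SetRow_ERMS covers arbitrary widths a byte at a time. A scalar sketch
// of ARGBSetRow_X86 (illustrative helper):
static __inline void ARGBSetRowScalar(uint8* dst_argb, uint32 v32,
                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {  // one little-endian dword per pixel
    dst_argb[4 * x + 0] = (uint8)(v32 >> 0);
    dst_argb[4 * x + 1] = (uint8)(v32 >> 8);
    dst_argb[4 * x + 2] = (uint8)(v32 >> 16);
    dst_argb[4 * x + 3] = (uint8)(v32 >> 24);
  }
}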
3067
3068 #ifdef HAS_YUY2TOYROW_SSE2
3069 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
3070 asm volatile (
3071 "pcmpeqb %%xmm5,%%xmm5 \n"
3072 "psrlw $0x8,%%xmm5 \n"
3073 LABELALIGN
3074 "1: \n"
3075 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3076 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3077 "lea " MEMLEA(0x20,0) ",%0 \n"
3078 "pand %%xmm5,%%xmm0 \n"
3079 "pand %%xmm5,%%xmm1 \n"
3080 "packuswb %%xmm1,%%xmm0 \n"
3081 "movdqu %%xmm0," MEMACCESS(1) " \n"
3082 "lea " MEMLEA(0x10,1) ",%1 \n"
3083 "sub $0x10,%2 \n"
3084 "jg 1b \n"
3085 : "+r"(src_yuy2), // %0
3086 "+r"(dst_y), // %1
3087 "+r"(width) // %2
3088 :
3089 : "memory", "cc"
3090 , "xmm0", "xmm1", "xmm5"
3091 );
3092 }
3093
3094 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
3095 uint8* dst_u, uint8* dst_v, int width) {
3096 asm volatile (
3097 "pcmpeqb %%xmm5,%%xmm5 \n"
3098 "psrlw $0x8,%%xmm5 \n"
3099 "sub %1,%2 \n"
3100 LABELALIGN
3101 "1: \n"
3102 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3103 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3104 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
3105 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
3106 "lea " MEMLEA(0x20,0) ",%0 \n"
3107 "pavgb %%xmm2,%%xmm0 \n"
3108 "pavgb %%xmm3,%%xmm1 \n"
3109 "psrlw $0x8,%%xmm0 \n"
3110 "psrlw $0x8,%%xmm1 \n"
3111 "packuswb %%xmm1,%%xmm0 \n"
3112 "movdqa %%xmm0,%%xmm1 \n"
3113 "pand %%xmm5,%%xmm0 \n"
3114 "packuswb %%xmm0,%%xmm0 \n"
3115 "psrlw $0x8,%%xmm1 \n"
3116 "packuswb %%xmm1,%%xmm1 \n"
3117 "movq %%xmm0," MEMACCESS(1) " \n"
3118 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3119 "lea " MEMLEA(0x8,1) ",%1 \n"
3120 "sub $0x10,%3 \n"
3121 "jg 1b \n"
3122 : "+r"(src_yuy2), // %0
3123 "+r"(dst_u), // %1
3124 "+r"(dst_v), // %2
3125 "+r"(width) // %3
3126 : "r"((intptr_t)(stride_yuy2)) // %4
3127 : "memory", "cc", NACL_R14
3128 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3129 );
3130 }
3131
3132 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3133 uint8* dst_u, uint8* dst_v, int width) {
3134 asm volatile (
3135 "pcmpeqb %%xmm5,%%xmm5 \n"
3136 "psrlw $0x8,%%xmm5 \n"
3137 "sub %1,%2 \n"
3138 LABELALIGN
3139 "1: \n"
3140 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3141 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3142 "lea " MEMLEA(0x20,0) ",%0 \n"
3143 "psrlw $0x8,%%xmm0 \n"
3144 "psrlw $0x8,%%xmm1 \n"
3145 "packuswb %%xmm1,%%xmm0 \n"
3146 "movdqa %%xmm0,%%xmm1 \n"
3147 "pand %%xmm5,%%xmm0 \n"
3148 "packuswb %%xmm0,%%xmm0 \n"
3149 "psrlw $0x8,%%xmm1 \n"
3150 "packuswb %%xmm1,%%xmm1 \n"
3151 "movq %%xmm0," MEMACCESS(1) " \n"
3152 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3153 "lea " MEMLEA(0x8,1) ",%1 \n"
3154 "sub $0x10,%3 \n"
3155 "jg 1b \n"
3156 : "+r"(src_yuy2), // %0
3157 "+r"(dst_u), // %1
3158 "+r"(dst_v), // %2
3159 "+r"(width) // %3
3160 :
3161 : "memory", "cc", NACL_R14
3162 "xmm0", "xmm1", "xmm5"
3163 );
3164 }
3165
3166 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
3167 asm volatile (
3168 LABELALIGN
3169 "1: \n"
3170 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3171 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3172 "lea " MEMLEA(0x20,0) ",%0 \n"
3173 "psrlw $0x8,%%xmm0 \n"
3174 "psrlw $0x8,%%xmm1 \n"
3175 "packuswb %%xmm1,%%xmm0 \n"
3176 "movdqu %%xmm0," MEMACCESS(1) " \n"
3177 "lea " MEMLEA(0x10,1) ",%1 \n"
3178 "sub $0x10,%2 \n"
3179 "jg 1b \n"
3180 : "+r"(src_uyvy), // %0
3181 "+r"(dst_y), // %1
3182 "+r"(width) // %2
3183 :
3184 : "memory", "cc"
3185 , "xmm0", "xmm1"
3186 );
3187 }
3188
3189 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
3190 uint8* dst_u, uint8* dst_v, int width) {
3191 asm volatile (
3192 "pcmpeqb %%xmm5,%%xmm5 \n"
3193 "psrlw $0x8,%%xmm5 \n"
3194 "sub %1,%2 \n"
3195 LABELALIGN
3196 "1: \n"
3197 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3198 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3199 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
3200 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
3201 "lea " MEMLEA(0x20,0) ",%0 \n"
3202 "pavgb %%xmm2,%%xmm0 \n"
3203 "pavgb %%xmm3,%%xmm1 \n"
3204 "pand %%xmm5,%%xmm0 \n"
3205 "pand %%xmm5,%%xmm1 \n"
3206 "packuswb %%xmm1,%%xmm0 \n"
3207 "movdqa %%xmm0,%%xmm1 \n"
3208 "pand %%xmm5,%%xmm0 \n"
3209 "packuswb %%xmm0,%%xmm0 \n"
3210 "psrlw $0x8,%%xmm1 \n"
3211 "packuswb %%xmm1,%%xmm1 \n"
3212 "movq %%xmm0," MEMACCESS(1) " \n"
3213 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3214 "lea " MEMLEA(0x8,1) ",%1 \n"
3215 "sub $0x10,%3 \n"
3216 "jg 1b \n"
3217 : "+r"(src_uyvy), // %0
3218 "+r"(dst_u), // %1
3219 "+r"(dst_v), // %2
3220 "+r"(width) // %3
3221 : "r"((intptr_t)(stride_uyvy)) // %4
3222 : "memory", "cc", NACL_R14
3223 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3224 );
3225 }
3226
3227 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3228 uint8* dst_u, uint8* dst_v, int width) {
3229 asm volatile (
3230 "pcmpeqb %%xmm5,%%xmm5 \n"
3231 "psrlw $0x8,%%xmm5 \n"
3232 "sub %1,%2 \n"
3233 LABELALIGN
3234 "1: \n"
3235 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3236 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3237 "lea " MEMLEA(0x20,0) ",%0 \n"
3238 "pand %%xmm5,%%xmm0 \n"
3239 "pand %%xmm5,%%xmm1 \n"
3240 "packuswb %%xmm1,%%xmm0 \n"
3241 "movdqa %%xmm0,%%xmm1 \n"
3242 "pand %%xmm5,%%xmm0 \n"
3243 "packuswb %%xmm0,%%xmm0 \n"
3244 "psrlw $0x8,%%xmm1 \n"
3245 "packuswb %%xmm1,%%xmm1 \n"
3246 "movq %%xmm0," MEMACCESS(1) " \n"
3247 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3248 "lea " MEMLEA(0x8,1) ",%1 \n"
3249 "sub $0x10,%3 \n"
3250 "jg 1b \n"
3251 : "+r"(src_uyvy), // %0
3252 "+r"(dst_u), // %1
3253 "+r"(dst_v), // %2
3254 "+r"(width) // %3
3255 :
3256 : "memory", "cc", NACL_R14
3257 "xmm0", "xmm1", "xmm5"
3258 );
3259 }
3260 #endif // HAS_YUY2TOYROW_SSE2
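
// Scalar sketch of the single-row extraction above (illustrative helper).
// A YUY2 pixel pair is stored as Y0 U Y1 V; the UV422 variants take U and V
// from one row, while the UV variants first average two rows with pavgb.
static __inline void YUY2ToUV422RowScalar(const uint8* src_yuy2,
                                          uint8* dst_u, uint8* dst_v,
                                          int width) {
  int x;
  for (x = 0; x < width; x += 2) {  // two Y samples share one U and one V
    dst_u[x / 2] = src_yuy2[2 * x + 1];
    dst_v[x / 2] = src_yuy2[2 * x + 3];
  }
}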
3261
3262 #ifdef HAS_YUY2TOYROW_AVX2
3263 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
3264 asm volatile (
3265 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3266 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3267 LABELALIGN
3268 "1: \n"
3269 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3270 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3271 "lea " MEMLEA(0x40,0) ",%0 \n"
3272 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3273 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3274 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3275 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3276 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
3277 "lea " MEMLEA(0x20,1) ",%1 \n"
3278 "sub $0x20,%2 \n"
3279 "jg 1b \n"
3280 "vzeroupper \n"
3281 : "+r"(src_yuy2), // %0
3282 "+r"(dst_y), // %1
3283 "+r"(width) // %2
3284 :
3285 : "memory", "cc"
3286 , "xmm0", "xmm1", "xmm5"
3287 );
3288 }
3289
3290 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
3291 uint8* dst_u, uint8* dst_v, int width) {
3292 asm volatile (
3293 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3294 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3295 "sub %1,%2 \n"
3296 LABELALIGN
3297 "1: \n"
3298 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3299 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3300 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3301 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3302 "lea " MEMLEA(0x40,0) ",%0 \n"
3303 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3304 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3305 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3306 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3307 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3308 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3309 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3310 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3311 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3312 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3313 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3314 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3315 "lea " MEMLEA(0x10,1) ",%1 \n"
3316 "sub $0x20,%3 \n"
3317 "jg 1b \n"
3318 "vzeroupper \n"
3319 : "+r"(src_yuy2), // %0
3320 "+r"(dst_u), // %1
3321 "+r"(dst_v), // %2
3322 "+r"(width) // %3
3323 : "r"((intptr_t)(stride_yuy2)) // %4
3324 : "memory", "cc", NACL_R14
3325 "xmm0", "xmm1", "xmm5"
3326 );
3327 }
3328
3329 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3330 uint8* dst_u, uint8* dst_v, int width) {
3331 asm volatile (
3332 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3333 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3334 "sub %1,%2 \n"
3335 LABELALIGN
3336 "1: \n"
3337 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3338 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3339 "lea " MEMLEA(0x40,0) ",%0 \n"
3340 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3341 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3342 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3343 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3344 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3345 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3346 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3347 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3348 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3349 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3350 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3351 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3352 "lea " MEMLEA(0x10,1) ",%1 \n"
3353 "sub $0x20,%3 \n"
3354 "jg 1b \n"
3355 "vzeroupper \n"
3356 : "+r"(src_yuy2), // %0
3357 "+r"(dst_u), // %1
3358 "+r"(dst_v), // %2
3359 "+r"(width) // %3
3360 :
3361 : "memory", "cc", NACL_R14
3362 "xmm0", "xmm1", "xmm5"
3363 );
3364 }
3365
UYVYToYRow_AVX2(const uint8 * src_uyvy,uint8 * dst_y,int width)3366 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
3367 asm volatile (
3368 LABELALIGN
3369 "1: \n"
3370 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3371 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3372 "lea " MEMLEA(0x40,0) ",%0 \n"
3373 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3374 "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3375 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3376 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3377 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
3378 "lea " MEMLEA(0x20,1) ",%1 \n"
3379 "sub $0x20,%2 \n"
3380 "jg 1b \n"
3381 "vzeroupper \n"
3382 : "+r"(src_uyvy), // %0
3383 "+r"(dst_y), // %1
3384 "+r"(width) // %2
3385 :
3386 : "memory", "cc"
3387 , "xmm0", "xmm1", "xmm5"
3388 );
3389 }
UYVYToUVRow_AVX2(const uint8 * src_uyvy,int stride_uyvy,uint8 * dst_u,uint8 * dst_v,int width)3390 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3391 uint8* dst_u, uint8* dst_v, int width) {
3392 asm volatile (
3393 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3394 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3395 "sub %1,%2 \n"
3396
3397 LABELALIGN
3398 "1: \n"
3399 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3400 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3401 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3402 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3403 "lea " MEMLEA(0x40,0) ",%0 \n"
3404 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3405 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3406 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3407 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3408 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3409 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3410 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3411 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3412 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3413 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3414 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3415 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3416 "lea " MEMLEA(0x10,1) ",%1 \n"
3417 "sub $0x20,%3 \n"
3418 "jg 1b \n"
3419 "vzeroupper \n"
3420 : "+r"(src_uyvy), // %0
3421 "+r"(dst_u), // %1
3422 "+r"(dst_v), // %2
3423 "+r"(width) // %3
3424 : "r"((intptr_t)(stride_uyvy)) // %4
3425 : "memory", "cc", NACL_R14
3426 "xmm0", "xmm1", "xmm5"
3427 );
3428 }
3429
UYVYToUV422Row_AVX2(const uint8 * src_uyvy,uint8 * dst_u,uint8 * dst_v,int width)3430 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3431 uint8* dst_u, uint8* dst_v, int width) {
3432 asm volatile (
3433 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3434 "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3435 "sub %1,%2 \n"
3436 LABELALIGN
3437 "1: \n"
3438 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3439 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3440 "lea " MEMLEA(0x40,0) ",%0 \n"
3441 "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3442 "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3443 "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3444 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3445 "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3446 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3447 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3448 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3449 "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3450 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3451 "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3452 VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3453 "lea " MEMLEA(0x10,1) ",%1 \n"
3454 "sub $0x20,%3 \n"
3455 "jg 1b \n"
3456 "vzeroupper \n"
3457 : "+r"(src_uyvy), // %0
3458 "+r"(dst_u), // %1
3459 "+r"(dst_v), // %2
3460 "+r"(width) // %3
3461 :
3462 : "memory", "cc", NACL_R14
3463 "xmm0", "xmm1", "xmm5"
3464 );
3465 }
3466 #endif // HAS_YUY2TOYROW_AVX2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};

// Blend 4 pixels at a time, with a 1 pixel leftover loop.
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0xf,%%xmm7 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x8,%%xmm6 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psllw $0x8,%%xmm5 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"

    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "movdqu " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movdqu " MEMACCESS(1) ",%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

    "49: \n"
    "add $0x3,%3 \n"
    "jl 99f \n"

    // 1 pixel loop.
    "91: \n"
    "movd " MEMACCESS(0) ",%%xmm3 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm0 \n"
    "pxor %%xmm4,%%xmm3 \n"
    "movd " MEMACCESS(1) ",%%xmm2 \n"
    "pshufb %4,%%xmm3 \n"
    "pand %%xmm6,%%xmm2 \n"
    "paddw %%xmm7,%%xmm3 \n"
    "pmullw %%xmm3,%%xmm2 \n"
    "movd " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "psrlw $0x8,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "psrlw $0x8,%%xmm2 \n"
    "paddusb %%xmm2,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x4,2) ",%2 \n"
    "sub $0x1,%3 \n"
    "jge 91b \n"
    "99: \n"
  : "+r"(src_argb0), // %0
    "+r"(src_argb1), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  : "m"(kShuffleAlpha) // %4
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBBLENDROW_SSSE3
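
// Hedged scalar sketch of the "over" blend above, one pixel at a time
// (illustrative helper, not the library's C path). The foreground alpha is
// inverted and biased by 1, applied to the background, then added with
// saturation; the result alpha is forced opaque.
static void ARGBBlendPixel_Ref(const uint8* f, const uint8* b, uint8* dst) {
  int ia = 255 - f[3] + 1;  // pxor with the alpha mask, then paddw of 1.
  int i;
  for (i = 0; i < 3; ++i) {  // B, G, R channels.
    int v = f[i] + ((b[i] * ia) >> 8);  // pmullw, psrlw $8, paddusb.
    dst[i] = (uint8)(v > 255 ? 255 : v);
  }
  dst[3] = 255u;  // por with 0xff000000: opaque result.
}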

#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
                         const uint8* alpha, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psllw $0x8,%%xmm5 \n"
    "mov $0x80808080,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "mov $0x807f807f,%%eax \n"
    "movd %%eax,%%xmm7 \n"
    "pshufd $0x0,%%xmm7,%%xmm7 \n"
    "sub %2,%0 \n"
    "sub %2,%1 \n"
    "sub %2,%3 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movq (%2),%%xmm0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "pxor %%xmm5,%%xmm0 \n"
    "movq (%0,%2,1),%%xmm1 \n"
    "movq (%1,%2,1),%%xmm2 \n"
    "punpcklbw %%xmm2,%%xmm1 \n"
    "psubb %%xmm6,%%xmm1 \n"
    "pmaddubsw %%xmm1,%%xmm0 \n"
    "paddw %%xmm7,%%xmm0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0,(%3,%2,1) \n"
    "lea 0x8(%2),%2 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(src0), // %0
    "+r"(src1), // %1
    "+r"(alpha), // %2
    "+r"(dst), // %3
    "+rm"(width) // %4
  :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_BLENDPLANEROW_SSSE3
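
// The unsigned math quoted above, written out as a scalar helper; the name
// is illustrative, not the library's C fallback.
static uint8 BlendPlanePixel_Ref(uint8 a2, uint8 b2, uint8 c2) {
  // =((A2*C2)+(B2*(255-C2))+255)/256
  return (uint8)(((a2 * c2) + (b2 * (255 - c2)) + 255) >> 8);
}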

#ifdef HAS_BLENDPLANEROW_AVX2
// Blend 32 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
                        const uint8* alpha, uint8* dst, int width) {
  asm volatile (
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpsllw $0x8,%%ymm5,%%ymm5 \n"
    "mov $0x80808080,%%eax \n"
    "vmovd %%eax,%%xmm6 \n"
    "vbroadcastss %%xmm6,%%ymm6 \n"
    "mov $0x807f807f,%%eax \n"
    "vmovd %%eax,%%xmm7 \n"
    "vbroadcastss %%xmm7,%%ymm7 \n"
    "sub %2,%0 \n"
    "sub %2,%1 \n"
    "sub %2,%3 \n"

    // 32 pixel loop.
    LABELALIGN
    "1: \n"
    "vmovdqu (%2),%%ymm0 \n"
    "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
    "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
    "vmovdqu (%0,%2,1),%%ymm1 \n"
    "vmovdqu (%1,%2,1),%%ymm2 \n"
    "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
    "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
    "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
    "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
    "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
    "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0,(%3,%2,1) \n"
    "lea 0x20(%2),%2 \n"
    "sub $0x20,%4 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src0), // %0
    "+r"(src1), // %1
    "+r"(alpha), // %2
    "+r"(dst), // %3
    "+rm"(width) // %4
  :: "memory", "cc", "eax",
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_BLENDPLANEROW_AVX2

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
static uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
};
static uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
};
// Attenuate 4 pixels at a time.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "pslld $0x18,%%xmm3 \n"
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "punpcklbw %%xmm1,%%xmm1 \n"
    "pmulhuw %%xmm1,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"
    "punpckhbw %%xmm2,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "pand %%xmm3,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "m"(kShuffleAlpha0), // %3
    "m"(kShuffleAlpha1) // %4
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_ARGBATTENUATEROW_SSSE3
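
// Hedged scalar sketch of attenuation (premultiplying color by alpha). The
// asm multiplies each channel word (c*257) by the duplicated alpha word
// (a*257), keeps the high 16 bits and drops 8 more, which is close to
// c*a/255; the rounding chosen here is an assumption for illustration.
static void ARGBAttenuatePixel_Ref(const uint8* src, uint8* dst) {
  uint32 a = src[3];
  dst[0] = (uint8)((src[0] * a + 255) >> 8);  // B
  dst[1] = (uint8)((src[1] * a + 255) >> 8);  // G
  dst[2] = (uint8)((src[2] * a + 255) >> 8);  // R
  dst[3] = (uint8)a;                          // A passes through.
}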

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha_AVX2 = {
  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
};
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4 \n"
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpslld $0x18,%%ymm5,%%ymm5 \n"
    "sub %0,%1 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
    "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
    "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
    "vpand %%ymm5,%%ymm6,%%ymm6 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpor %%ymm6,%%ymm0,%%ymm0 \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "m"(kShuffleAlpha_AVX2) // %3
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha;
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movzb " MEMACCESS2(0x03,0) ",%3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
    "movzb " MEMACCESS2(0x07,0) ",%3 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "movlhps %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
    "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "movlhps %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width), // %2
    "=&r"(alpha) // %3
  : "r"(fixed_invtbl8) // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_ARGBUNATTENUATEROW_SSE2
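
// Hedged scalar sketch of unattenuation: each color channel is divided by
// alpha through a per-alpha reciprocal table, as the asm does with
// fixed_invtbl8. The reciprocal used here (an 8.8 fixed-point 255/a) is an
// assumption for illustration, not the table's actual encoding.
static void ARGBUnattenuatePixel_Ref(const uint8* src, uint8* dst) {
  uint32 a = src[3];
  uint32 inv = a ? (255u * 256u) / a : 256u;  // illustrative reciprocal.
  uint32 b = (src[0] * inv) >> 8;
  uint32 g = (src[1] * inv) >> 8;
  uint32 r = (src[2] * inv) >> 8;
  dst[0] = (uint8)(b > 255u ? 255u : b);
  dst[1] = (uint8)(g > 255u ? 255u : g);
  dst[2] = (uint8)(r > 255u ? 255u : r);
  dst[3] = (uint8)a;
}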

#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
};
// Unattenuate 8 pixels at a time.
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha;
  asm volatile (
    "sub %0,%1 \n"
    "vbroadcastf128 %5,%%ymm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    // Gather 8 alpha table entries with scalar loads (replaces VPGATHER).
    "movzb " MEMACCESS2(0x03,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
    "movzb " MEMACCESS2(0x07,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
    "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
    "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
    "movzb " MEMACCESS2(0x13,0) ",%3 \n"
    "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
    "movzb " MEMACCESS2(0x17,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
    "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
    "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
    "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
    // end of VPGATHER

    "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
    "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
    "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width), // %2
    "=&r"(alpha) // %3
  : "r"(fixed_invtbl8), // %4
    "m"(kUnattenShuffleAlpha_AVX2) // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBUNATTENUATEROW_AVX2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm5,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrld $0x18,%%xmm2 \n"
    "psrld $0x18,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm3 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"
    "punpckhwd %%xmm3,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "m"(kARGBToYJ), // %3
    "m"(kAddYJ64) // %4
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_ARGBGRAYROW_SSSE3
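
// Hedged scalar sketch of the gray conversion above: a full-range luma in
// 7 bit fixed point (coefficients from kARGBToYJ, rounding from kAddYJ64),
// written back to B, G and R with alpha passed through. Helper name is
// illustrative.
static void ARGBGrayPixel_Ref(const uint8* src, uint8* dst) {
  uint8 y = (uint8)((15 * src[0] + 75 * src[1] + 38 * src[2] + 64) >> 7);
  dst[0] = y;       // B
  dst[1] = y;       // G
  dst[2] = y;       // R
  dst[3] = src[3];  // A
}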

#ifdef HAS_ARGBSEPIAROW_SSSE3
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone
static vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa %2,%%xmm2 \n"
    "movdqa %3,%%xmm3 \n"
    "movdqa %4,%%xmm4 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "pmaddubsw %%xmm2,%%xmm6 \n"
    "phaddw %%xmm6,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm5 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm5 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm5 \n"
    "psrlw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm5 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm5 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "phaddw %%xmm1,%%xmm5 \n"
    "psrlw $0x7,%%xmm5 \n"
    "packuswb %%xmm5,%%xmm5 \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "psrld $0x18,%%xmm6 \n"
    "psrld $0x18,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm5 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm5,%%xmm0 \n"
    "punpckhwd %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(0) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x8,%1 \n"
    "jg 1b \n"
  : "+r"(dst_argb), // %0
    "+r"(width) // %1
  : "m"(kARGBToSepiaB), // %2
    "m"(kARGBToSepiaG), // %3
    "m"(kARGBToSepiaR) // %4
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif // HAS_ARGBSEPIAROW_SSSE3
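
// The sepia formulas from the comments above as a scalar helper (name is
// illustrative); the clamp mirrors packuswb saturation, alpha is untouched.
static void ARGBSepiaPixel_Ref(uint8* p) {
  int b = p[0], g = p[1], r = p[2];
  int nb = (r * 35 + g * 68 + b * 17) >> 7;
  int ng = (r * 45 + g * 88 + b * 22) >> 7;
  int nr = (r * 50 + g * 98 + b * 24) >> 7;
  p[0] = (uint8)(nb > 255 ? 255 : nb);
  p[1] = (uint8)(ng > 255 ? 255 : ng);
  p[2] = (uint8)(nr > 255 ? 255 : nr);
}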

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  asm volatile (
    "movdqu " MEMACCESS(3) ",%%xmm5 \n"
    "pshufd $0x00,%%xmm5,%%xmm2 \n"
    "pshufd $0x55,%%xmm5,%%xmm3 \n"
    "pshufd $0xaa,%%xmm5,%%xmm4 \n"
    "pshufd $0xff,%%xmm5,%%xmm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "pmaddubsw %%xmm2,%%xmm7 \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "phaddsw %%xmm7,%%xmm0 \n"
    "phaddsw %%xmm1,%%xmm6 \n"
    "psraw $0x6,%%xmm0 \n"
    "psraw $0x6,%%xmm6 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
    "pmaddubsw %%xmm4,%%xmm1 \n"
    "pmaddubsw %%xmm4,%%xmm7 \n"
    "phaddsw %%xmm7,%%xmm1 \n"
    "movdqu " MEMACCESS(0) ",%%xmm6 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm7 \n"
    "phaddsw %%xmm7,%%xmm6 \n"
    "psraw $0x6,%%xmm1 \n"
    "psraw $0x6,%%xmm6 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "punpcklbw %%xmm6,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "punpcklwd %%xmm1,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm6 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "r"(matrix_argb) // %3
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
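
// Hedged scalar sketch of the color-matrix row: each output channel is a
// signed dot product of B,G,R,A with one 4-entry row of matrix_argb, scaled
// down by 6 bits (psraw $0x6) and saturated. Helper name is illustrative.
static void ARGBColorMatrixPixel_Ref(const uint8* src, uint8* dst,
                                     const int8* m) {
  int j;
  for (j = 0; j < 4; ++j) {  // output B, G, R, A in turn.
    int v = (src[0] * m[j * 4 + 0] + src[1] * m[j * 4 + 1] +
             src[2] * m[j * 4 + 2] + src[3] * m[j * 4 + 3]) >> 6;
    dst[j] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}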

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    "movd %2,%%xmm2 \n"
    "movd %3,%%xmm3 \n"
    "movd %4,%%xmm4 \n"
    "pshuflw $0x40,%%xmm2,%%xmm2 \n"
    "pshufd $0x44,%%xmm2,%%xmm2 \n"
    "pshuflw $0x40,%%xmm3,%%xmm3 \n"
    "pshufd $0x44,%%xmm3,%%xmm3 \n"
    "pshuflw $0x40,%%xmm4,%%xmm4 \n"
    "pshufd $0x44,%%xmm4,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "pslld $0x18,%%xmm6 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm1 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "pmullw %%xmm3,%%xmm0 \n"
    "movdqu " MEMACCESS(0) ",%%xmm7 \n"
    "pmullw %%xmm3,%%xmm1 \n"
    "pand %%xmm6,%%xmm7 \n"
    "paddw %%xmm4,%%xmm0 \n"
    "paddw %%xmm4,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x4,%1 \n"
    "jg 1b \n"
  : "+r"(dst_argb), // %0
    "+r"(width) // %1
  : "r"(scale), // %2
    "r"(interval_size), // %3
    "r"(interval_offset) // %4
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBQUANTIZEROW_SSE2
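
// Hedged scalar sketch of quantization: pmulhuw keeps the high 16 bits of
// the channel * scale product, so scale acts as a 16.16 factor; the value is
// then snapped to an interval grid. Alpha is preserved via the 0xff000000
// mask. Helper name is illustrative.
static void ARGBQuantizePixel_Ref(uint8* p, int scale, int interval_size,
                                  int interval_offset) {
  int i;
  for (i = 0; i < 3; ++i) {  // B, G, R; p[3] (alpha) is untouched.
    p[i] = (uint8)((p[i] * scale >> 16) * interval_size + interval_offset);
  }
}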

#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    "movd %3,%%xmm2 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "punpcklqdq %%xmm2,%%xmm2 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm1 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "r"(value) // %3
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2"
  );
}
#endif // HAS_ARGBSHADEROW_SSE2
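
// Hedged scalar sketch of shading: value supplies one byte per channel
// (B,G,R,A); pmulhuw of (c*257)*(v*257) followed by psrlw $8 is roughly
// c*v/255 per channel. Helper name is illustrative.
static void ARGBShadePixel_Ref(const uint8* src, uint8* dst, uint32 value) {
  int i;
  for (i = 0; i < 4; ++i) {
    uint32 v = (value >> (i * 8)) & 0xff;
    dst[i] = (uint8)((src[i] * 257 * (v * 257)) >> 24);  // ~ src * v / 255.
  }
}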

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqu " MEMACCESS(1) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqu %%xmm0,%%xmm1 \n"
    "movdqu %%xmm2,%%xmm3 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpckhbw %%xmm5,%%xmm3 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "pmulhuw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(src_argb1), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif // HAS_ARGBMULTIPLYROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vmovdqu %%ymm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x20,2) ",%2 \n"
    "sub $0x8,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0), // %0
    "+r"(src_argb1), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "memory", "cc"
#if defined(__AVX2__)
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif // HAS_ARGBMULTIPLYROW_AVX2
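
// Hedged scalar sketch of the multiply rows: one operand is widened to
// c0*257 words, the other zero-extended, and pmulhuw keeps the high 16 bits,
// which is approximately c0*c1/255 per channel (a photographic "multiply").
// Helper name is illustrative.
static void ARGBMultiplyPixel_Ref(const uint8* s0, const uint8* s1,
                                  uint8* dst) {
  int i;
  for (i = 0; i < 4; ++i) {
    dst[i] = (uint8)((s0[i] * 257 * s1[i]) >> 16);  // ~ s0 * s1 / 255.
  }
}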

#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqu " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(src_argb1), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "memory", "cc"
  , "xmm0", "xmm1"
  );
}
#endif // HAS_ARGBADDROW_SSE2

#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "vmovdqu %%ymm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x20,2) ",%2 \n"
    "sub $0x8,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0), // %0
    "+r"(src_argb1), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "memory", "cc"
  , "xmm0"
  );
}
#endif // HAS_ARGBADDROW_AVX2

#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqu " MEMACCESS(1) ",%%xmm1 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "psubusb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(src_argb1), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "memory", "cc"
  , "xmm0", "xmm1"
  );
}
#endif // HAS_ARGBSUBTRACTROW_SSE2

#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "vmovdqu %%ymm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x20,2) ",%2 \n"
    "sub $0x8,%3 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb0), // %0
    "+r"(src_argb1), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "memory", "cc"
  , "xmm0"
  );
}
#endif // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "sub %0,%3 \n"
    "pxor %%xmm5,%%xmm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
    MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
    MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "psubw %%xmm0,%%xmm1 \n"
    "pmaxsw %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x8,%4 \n"
    "jg 1b \n"
  : "+r"(src_y0), // %0
    "+r"(src_y1), // %1
    "+r"(src_y2), // %2
    "+r"(dst_sobelx), // %3
    "+r"(width) // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif // HAS_SOBELXROW_SSE2

#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "sub %0,%2 \n"
    "pxor %%xmm5,%%xmm5 \n"

    // 8 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "psubw %%xmm1,%%xmm0 \n"
    "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
    MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "psubw %%xmm2,%%xmm1 \n"
    "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
    MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "psubw %%xmm3,%%xmm2 \n"
    "paddw %%xmm2,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "paddw %%xmm1,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "psubw %%xmm0,%%xmm1 \n"
    "pmaxsw %%xmm1,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x8,%3 \n"
    "jg 1b \n"
  : "+r"(src_y0), // %0
    "+r"(src_y1), // %1
    "+r"(dst_sobely), // %2
    "+r"(width) // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif // HAS_SOBELYROW_SSE2
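
// Hedged scalar sketch of the two Sobel rows above. The asm folds each
// matrix into differences of the three input rows (columns, for SobelY),
// doubles the middle term, then takes a saturated absolute value. SobelY is
// the same computation with the row and column roles swapped.
static uint8 SobelXPixel_Ref(const uint8* r0, const uint8* r1,
                             const uint8* r2, int i) {
  int s = (r0[i] - r0[i + 2]) + 2 * (r1[i] - r1[i + 2]) +
          (r2[i] - r2[i + 2]);
  if (s < 0) s = -s;                   // pxor/psubw/pmaxsw absolute value.
  return (uint8)(s > 255 ? 255 : s);   // packuswb saturation.
}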

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"

    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm2 \n"
    "punpckhbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm1 \n"
    "punpckhwd %%xmm2,%%xmm2 \n"
    "por %%xmm5,%%xmm1 \n"
    "por %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklwd %%xmm0,%%xmm3 \n"
    "punpckhwd %%xmm0,%%xmm0 \n"
    "por %%xmm5,%%xmm3 \n"
    "por %%xmm5,%%xmm0 \n"
    "movdqu %%xmm1," MEMACCESS(2) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
    "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
    "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_sobelx), // %0
    "+r"(src_sobely), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif // HAS_SOBELROW_SSE2
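
// Hedged scalar sketch of the interleave above: the saturated sum of the
// two Sobel planes is replicated into B, G and R with A forced to 255.
static void SobelPixel_Ref(uint8 sx, uint8 sy, uint8* dst_argb) {
  int s = sx + sy;
  uint8 v = (uint8)(s > 255 ? 255 : s);  // paddusb saturation.
  dst_argb[0] = v;     // B
  dst_argb[1] = v;     // G
  dst_argb[2] = v;     // R
  dst_argb[3] = 255u;  // A, from the 0xff000000 mask in xmm5.
}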

#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"

    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_sobelx), // %0
    "+r"(src_sobely), // %1
    "+r"(dst_y), // %2
    "+r"(width) // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
#endif // HAS_SOBELTOPLANEROW_SSE2

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    "sub %0,%1 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"

    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "paddusb %%xmm1,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "punpcklbw %%xmm5,%%xmm3 \n"
    "punpckhbw %%xmm5,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "punpcklbw %%xmm2,%%xmm4 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "punpcklwd %%xmm3,%%xmm6 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "punpcklwd %%xmm0,%%xmm7 \n"
    "punpckhwd %%xmm0,%%xmm1 \n"
    "movdqu %%xmm6," MEMACCESS(2) " \n"
    "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
    "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "sub $0x10,%3 \n"
    "jg 1b \n"
  : "+r"(src_sobelx), // %0
    "+r"(src_sobely), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_SOBELXYROW_SSE2

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    "pxor %%xmm0,%%xmm0 \n"
    "pxor %%xmm1,%%xmm1 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"
    "test $0xf,%1 \n"
    "jne 49f \n"

    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm2,%%xmm4 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "punpckhwd %%xmm1,%%xmm3 \n"
    "punpckhbw %%xmm1,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "punpcklwd %%xmm1,%%xmm4 \n"
    "punpckhwd %%xmm1,%%xmm5 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(2) ",%%xmm2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
    "paddd %%xmm0,%%xmm3 \n"
    "paddd %%xmm4,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
    "paddd %%xmm0,%%xmm4 \n"
    "paddd %%xmm5,%%xmm0 \n"
    "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "paddd %%xmm0,%%xmm5 \n"
    "movdqu %%xmm2," MEMACCESS(1) " \n"
    "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
    "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

    "49: \n"
    "add $0x3,%3 \n"
    "jl 19f \n"

    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "movd " MEMACCESS(0) ",%%xmm2 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "punpcklbw %%xmm1,%%xmm2 \n"
    "punpcklwd %%xmm1,%%xmm2 \n"
    "paddd %%xmm2,%%xmm0 \n"
    "movdqu " MEMACCESS(2) ",%%xmm2 \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "movdqu %%xmm2," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"

    "19: \n"
  : "+r"(row), // %0
    "+r"(cumsum), // %1
    "+r"(previous_cumsum), // %2
    "+r"(width) // %3
  :
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
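
// Hedged scalar sketch of one row of the integral image described above: a
// running sum of the current row's bytes (4 lanes per pixel) plus the
// corresponding entry of the row above. Helper name is illustrative.
static void ComputeCumulativeSumRow_Ref(const uint8* row, int32* cumsum,
                                        const int32* previous_cumsum,
                                        int width) {
  int32 sum[4] = {0, 0, 0, 0};
  int x, i;
  for (x = 0; x < width; ++x) {
    for (i = 0; i < 4; ++i) {
      sum[i] += row[x * 4 + i];
      cumsum[x * 4 + i] = sum[i] + previous_cumsum[x * 4 + i];
    }
  }
}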

#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  asm volatile (
    "movd %5,%%xmm5 \n"
    "cvtdq2ps %%xmm5,%%xmm5 \n"
    "rcpss %%xmm5,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "sub $0x4,%3 \n"
    "jl 49f \n"
    "cmpl $0x80,%5 \n"
    "ja 40f \n"

    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrld $0x10,%%xmm6 \n"
    "cvtdq2ps %%xmm6,%%xmm6 \n"
    "addps %%xmm6,%%xmm5 \n"
    "mulps %%xmm4,%%xmm5 \n"
    "cvtps2dq %%xmm5,%%xmm5 \n"
    "packssdw %%xmm5,%%xmm5 \n"

    // 4 pixel small loop.
    LABELALIGN
    "4: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
    "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
    "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm0 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jge 4b \n"
    "jmp 49f \n"

    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
    "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
    "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm1,%%xmm1 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "mulps %%xmm4,%%xmm1 \n"
    "cvtdq2ps %%xmm2,%%xmm2 \n"
    "cvtdq2ps %%xmm3,%%xmm3 \n"
    "mulps %%xmm4,%%xmm2 \n"
    "mulps %%xmm4,%%xmm3 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "cvtps2dq %%xmm1,%%xmm1 \n"
    "cvtps2dq %%xmm2,%%xmm2 \n"
    "cvtps2dq %%xmm3,%%xmm3 \n"
    "packssdw %%xmm1,%%xmm0 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    "packuswb %%xmm2,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jge 40b \n"

    "49: \n"
    "add $0x3,%3 \n"
    "jl 19f \n"

    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "psubd " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "mulps %%xmm4,%%xmm0 \n"
    "cvtps2dq %%xmm0,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x4,2) ",%2 \n"
    "sub $0x1,%3 \n"
    "jge 10b \n"
    "19: \n"
  : "+r"(topleft), // %0
    "+r"(botleft), // %1
    "+r"(dst), // %2
    "+rm"(count) // %3
  : "r"((intptr_t)(width)), // %4
    "rm"(area) // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
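
// Hedged scalar sketch of the box-filter average: four corner reads of the
// integral image give each pixel's area total, scaled by an approximate
// reciprocal of area (rcpss). Assumed here: width is in int32 units (4 per
// pixel), matching the asm's (%0,%4,4) addressing. Name is illustrative.
static void CumulativeSumToAverage_Ref(const int32* topleft,
                                       const int32* botleft, int width,
                                       int area, uint8* dst, int count) {
  float ooa = 1.0f / (float)area;
  int x, i;
  for (x = 0; x < count; ++x) {
    for (i = 0; i < 4; ++i) {
      int32 sum = topleft[i] - topleft[width + i] - botleft[i] +
                  botleft[width + i];
      int v = (int)((float)sum * ooa);
      dst[i] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}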

#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* src_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile (
    "movq " MEMACCESS(3) ",%%xmm2 \n"
    "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
    "shl $0x10,%1 \n"
    "add $0x4,%1 \n"
    "movd %1,%%xmm5 \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"

    "pshufd $0x44,%%xmm7,%%xmm7 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "addps %%xmm7,%%xmm0 \n"
    "movlhps %%xmm0,%%xmm2 \n"
    "movdqa %%xmm7,%%xmm4 \n"
    "addps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "addps %%xmm4,%%xmm3 \n"
    "addps %%xmm4,%%xmm4 \n"

    // 4 pixel loop.
    LABELALIGN
    "40: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
    "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
    "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
    "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
    MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm1 \n"
    "addps %%xmm4,%%xmm2 \n"
    "movq %%xmm1," MEMACCESS(2) " \n"
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
    MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm0 \n"
    "addps %%xmm4,%%xmm3 \n"
    "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%4 \n"
    "jge 40b \n"

    "49: \n"
    "add $0x3,%4 \n"
    "jl 19f \n"

    // 1 pixel loop.
    LABELALIGN
    "10: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"
    "addps %%xmm7,%%xmm2 \n"
    "movd %%xmm0,%k1 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x04,2) ",%2 \n"
    "sub $0x1,%4 \n"
    "jge 10b \n"
    "19: \n"
  : "+r"(src_argb), // %0
    "+r"(src_argb_stride_temp), // %1
    "+r"(dst_argb), // %2
    "+r"(src_dudv), // %3
    "+rm"(width), // %4
    "=&r"(temp) // %5
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif // HAS_ARGBAFFINEROW_SSE2
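
// Hedged scalar sketch of the affine row: (u,v) starts at src_dudv[0..1],
// steps by src_dudv[2..3] per destination pixel, and each sample is one
// ARGB fetch at (floor(u), floor(v)), matching the cvttps2dq truncation and
// the x*4 + y*stride offset from pmaddwd. Helper name is illustrative.
static void ARGBAffineRow_Ref(const uint8* src_argb, int src_argb_stride,
                              uint8* dst_argb, const float* src_dudv,
                              int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)u;
    int y = (int)v;
    *(uint32*)(dst_argb + i * 4) =
        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
    u += src_dudv[2];
    v += src_dudv[3];
  }
}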

#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  asm volatile (
    "sub %1,%0 \n"
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "cmp $0x80,%3 \n"
    "je 50f \n"

    "movd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x100,%3 \n"
    "movd %3,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x80808080,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"

    // General purpose row blend.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "psubb %%xmm4,%%xmm0 \n"
    "psubb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm5,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm3 \n"
    "pmaddubsw %%xmm0,%%xmm2 \n"
    "pmaddubsw %%xmm1,%%xmm3 \n"
    "paddw %%xmm4,%%xmm2 \n"
    "paddw %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb %%xmm1,%%xmm0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 50b \n"
    "jmp 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 100b \n"

    "99: \n"
  : "+r"(dst_ptr), // %0
    "+r"(src_ptr), // %1
    "+rm"(dst_width), // %2
    "+r"(source_y_fraction) // %3
  : "r"((intptr_t)(src_stride)) // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif // HAS_INTERPOLATEROW_SSSE3
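
// Hedged scalar sketch of the bilinear row blend: a fixed-point lerp
// between the two source rows; the 0x8080 bias in the asm implements
// rounding for the signed pmaddubsw path. Helper name and the exact
// rounding constant are assumptions for illustration.
static void InterpolateRow_Ref(uint8* dst_ptr, const uint8* src_ptr,
                               ptrdiff_t src_stride, int dst_width,
                               int source_y_fraction) {
  int y1 = source_y_fraction;  // 0 = copy row 0, 128 = 50/50 average.
  int y0 = 256 - y1;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] =
        (uint8)((src_ptr[x] * y0 + src_ptr[x + src_stride] * y1 + 128) >> 8);
  }
}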

#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  asm volatile (
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "sub %1,%0 \n"
    "cmp $0x80,%3 \n"
    "je 50f \n"

    "vmovd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x100,%3 \n"
    "vmovd %3,%%xmm5 \n"
    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
    "vbroadcastss %%xmm5,%%ymm5 \n"
    "mov $0x80808080,%%eax \n"
    "vmovd %%eax,%%xmm4 \n"
    "vbroadcastss %%xmm4,%%ymm4 \n"

    // General purpose row blend.
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
    "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
    "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Blend 50 / 50.
    LABELALIGN
    "50: \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 50b \n"
    "jmp 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
    "100: \n"
    "rep movsb " MEMMOVESTRING(1,0) " \n"
    "jmp 999f \n"

    "99: \n"
    "vzeroupper \n"
    "999: \n"
  : "+D"(dst_ptr), // %0
    "+S"(src_ptr), // %1
    "+cm"(dst_width), // %2
    "+r"(source_y_fraction) // %3
  : "r"((intptr_t)(src_stride)) // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
  );
}
#endif // HAS_INTERPOLATEROW_AVX2

#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int width) {
  asm volatile (
    "movdqu " MEMACCESS(3) ",%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "r"(shuffler) // %3
  : "memory", "cc"
  , "xmm0", "xmm1", "xmm5"
  );
}
#endif // HAS_ARGBSHUFFLEROW_SSSE3
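
// Hedged scalar sketch of the byte shuffle: each output byte of a 4 byte
// pixel is picked from the source pixel by the corresponding shuffler
// index, which is how the pshufb mask above is built. Name is illustrative.
static void ARGBShufflePixel_Ref(const uint8* src, uint8* dst,
                                 const uint8* shuffler) {
  dst[0] = src[shuffler[0]];
  dst[1] = src[shuffler[1]];
  dst[2] = src[shuffler[2]];
  dst[3] = src[shuffler[3]];
}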

#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
    "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2
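
// A scalar sketch (added for clarity, hypothetical guard macro, not part of
// the original file) of what the shuffle rows above do: the 16-byte shuffler
// gives, for each output byte, the index of the source byte within its
// 16-byte group, matching pshufb semantics. For example, a shuffler beginning
// {3, 2, 1, 0, ...} reverses the channel order of each 4-byte pixel.
#ifdef LIBYUV_ROW_REFERENCE_SKETCHES
static void ARGBShuffleRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  const uint8* shuffler, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    // Index within the current 16-byte block, remapped through the shuffler.
    dst_argb[i] = src_argb[(i & ~15) + shuffler[i & 15]];
  }
}
#endif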

#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // Dispatch on the first 4 shuffler bytes, read as one little-endian
    // 32 bit word, to a specialized loop for the common shuffles.
    "pxor %%xmm5,%%xmm5 \n"
    "mov " MEMACCESS(4) ",%k2 \n"
    "cmp $0x3000102,%k2 \n"  // shuffler {2, 1, 0, 3}
    "je 3012f \n"
    "cmp $0x10203,%k2 \n"  // shuffler {3, 2, 1, 0}
    "je 123f \n"
    "cmp $0x30201,%k2 \n"  // shuffler {1, 2, 3, 0}
    "je 321f \n"
    "cmp $0x2010003,%k2 \n"  // shuffler {3, 0, 1, 2}
    "je 2103f \n"

    // Generic fallback: shuffle one pixel (4 bytes) at a time.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS(1) " \n"
    "movzb " MEMACCESS2(0x1,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x1,1) " \n"
    "movzb " MEMACCESS2(0x2,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x2,1) " \n"
    "movzb " MEMACCESS2(0x3,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x3,1) " \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jg 1b \n"
    "jmp 99f \n"

    LABELALIGN
    "123: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
    "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 123b \n"
    "jmp 99f \n"

    LABELALIGN
    "321: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x39,%%xmm0,%%xmm0 \n"
    "pshuflw $0x39,%%xmm0,%%xmm0 \n"
    "pshufhw $0x39,%%xmm1,%%xmm1 \n"
    "pshuflw $0x39,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 321b \n"
    "jmp 99f \n"

    LABELALIGN
    "2103: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x93,%%xmm0,%%xmm0 \n"
    "pshuflw $0x93,%%xmm0,%%xmm0 \n"
    "pshufhw $0x93,%%xmm1,%%xmm1 \n"
    "pshuflw $0x93,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 2103b \n"
    "jmp 99f \n"

    LABELALIGN
    "3012: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
    "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
    "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
    "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 3012b \n"

    "99: \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "=&d"(pixel_temp),  // %2
    "+r"(width)         // %3
  : "r"(shuffler)       // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSE2
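
// A small sketch (hypothetical guard, not part of the original file) of how
// the SSE2 version above recognizes common shufflers: the first four shuffler
// bytes are loaded as one little-endian 32-bit word, so shuffler {3, 2, 1, 0}
// is seen as 0x00010203 and dispatches to the specialized "123" loop.
#ifdef LIBYUV_ROW_REFERENCE_SKETCHES
static int IsReverseShuffler(const uint8* shuffler) {
  // Little-endian load of shuffler[0..3], as done by "mov (%4),%k2".
  uint32 packed = (uint32)shuffler[0] | ((uint32)shuffler[1] << 8) |
                  ((uint32)shuffler[2] << 16) | ((uint32)shuffler[3] << 24);
  return packed == 0x00010203u;  // {3, 2, 1, 0}: reversed channels.
}
#endif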

#ifdef HAS_I422TOYUY2ROW_SSE2
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)  // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(3) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_y),      // %0
    "+r"(src_u),      // %1
    "+r"(src_v),      // %2
    "+r"(dst_frame),  // %3
    "+rm"(width)      // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOYUY2ROW_SSE2
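
// A scalar sketch (hypothetical guard, not part of the original file) of the
// YUY2 packing above: each pair of Y samples shares one U and one V sample,
// emitted in Y0 U Y1 V order.
#ifdef LIBYUV_ROW_REFERENCE_SKETCHES
static void I422ToYUY2Row_Sketch(const uint8* src_y, const uint8* src_u,
                                 const uint8* src_v, uint8* dst_frame,
                                 int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    ++src_u;
    ++src_v;
  }
}
#endif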

#ifdef HAS_I422TOUYVYROW_SSE2
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)  // movq (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS(3) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_y),      // %0
    "+r"(src_u),      // %1
    "+r"(src_v),      // %2
    "+r"(dst_frame),  // %3
    "+rm"(width)      // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOUYVYROW_SSE2
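
// The matching sketch for UYVY (hypothetical guard, as above): same data as
// YUY2 but with chroma leading each pair, in U Y0 V Y1 order.
#ifdef LIBYUV_ROW_REFERENCE_SKETCHES
static void I422ToUYVYRow_Sketch(const uint8* src_y, const uint8* src_u,
                                 const uint8* src_v, uint8* dst_frame,
                                 int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_frame[0] = src_u[0];
    dst_frame[1] = src_y[0];
    dst_frame[2] = src_v[0];
    dst_frame[3] = src_y[1];
    dst_frame += 4;
    src_y += 2;
    ++src_u;
    ++src_v;
  }
}
#endif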

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "pxor %%xmm3,%%xmm3 \n"

    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm3,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
    "addps " MEMACCESS(3) ",%%xmm0 \n"
    "addps " MEMACCESS(3) ",%%xmm4 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm6 \n"
    "mulps %%xmm1,%%xmm2 \n"
    "mulps %%xmm5,%%xmm6 \n"
    "mulps %%xmm2,%%xmm1 \n"
    "mulps %%xmm6,%%xmm5 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
    "addps %%xmm2,%%xmm0 \n"
    "addps %%xmm6,%%xmm4 \n"
    "addps %%xmm1,%%xmm0 \n"
    "addps %%xmm5,%%xmm4 \n"
    "cvttps2dq %%xmm0,%%xmm0 \n"
    "cvttps2dq %%xmm4,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x2,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"

    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n"  // 2 ARGB pixels
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "vcvtdq2ps %%ymm0,%%ymm0 \n"  // X 8 floats
    "vmulps %%ymm0,%%ymm0,%%ymm2 \n"  // X * X
    "vmulps %%ymm7,%%ymm0,%%ymm3 \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n"  // result += C3 * X * X * X
    "vcvttps2dq %%ymm0,%%ymm0 \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
    "vmovq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x2,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
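
// A scalar sketch (hypothetical guard, not part of the original file) of the
// cubic polynomial both rows above evaluate per channel:
//   dst = clamp(C0 + C1*x + C2*x^2 + C3*x^3)
// where poly holds one vec4 of per-channel coefficients at each of the
// offsets 0x00, 0x10, 0x20 and 0x30.
#ifdef LIBYUV_ROW_REFERENCE_SKETCHES
static void ARGBPolynomialRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                     const float* poly, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    float x = (float)src_argb[i];
    int c = i & 3;  // Channel selects the coefficient lane.
    float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
              poly[c + 12] * x * x * x;
    // Truncate and saturate, mirroring cvttps2dq + packuswb.
    dst_argb[i] = (uint8)(v < 0.f ? 0.f : v > 255.f ? 255.f : v);
  }
}
#endif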

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n"  // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n"  // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n"  // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
    MEMOPARG(movzb,0x03,3,1,4,1) " \n"  // movzb 0x3(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x1,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n"  // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n"  // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n"  // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86
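
// A scalar sketch (hypothetical guard, not part of the original file) of the
// two color-table rows above: the table holds 256 interleaved ARGB entries,
// so channel c of each pixel is replaced by table_argb[value * 4 + c]. The
// RGB variant is identical except that it leaves the alpha byte untouched.
#ifdef LIBYUV_ROW_REFERENCE_SKETCHES
static void ARGBColorTableRow_Sketch(uint8* dst_argb, const uint8* table_argb,
                                     int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int c;
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = table_argb[dst_argb[c] * 4 + c];
    }
    dst_argb += 4;
  }
}
#endif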

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform ARGB pixels with luma table.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile (
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0x8,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(2) ",%%xmm0 \n"
    "pmaddubsw %%xmm3,%%xmm0 \n"
    "phaddw %%xmm0,%%xmm0 \n"
    "pand %%xmm4,%%xmm0 \n"
    "punpcklwd %%xmm5,%%xmm0 \n"
    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS(2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS(3) " \n"
    "movzb " MEMACCESS2(0x1,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x1,3) " \n"
    "movzb " MEMACCESS2(0x2,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x2,3) " \n"
    "movzb " MEMACCESS2(0x3,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x3,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS2(0x4,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x4,3) " \n"
    "movzb " MEMACCESS2(0x5,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x5,3) " \n"
    "movzb " MEMACCESS2(0x6,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x6,3) " \n"
    "movzb " MEMACCESS2(0x7,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x7,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS2(0x8,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x8,3) " \n"
    "movzb " MEMACCESS2(0x9,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x9,3) " \n"
    "movzb " MEMACCESS2(0xa,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xa,3) " \n"
    "movzb " MEMACCESS2(0xb,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xb,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"

    "movzb " MEMACCESS2(0xc,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xc,3) " \n"
    "movzb " MEMACCESS2(0xd,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xd,3) " \n"
    "movzb " MEMACCESS2(0xe,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xe,3) " \n"
    "movzb " MEMACCESS2(0xf,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xf,3) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "lea " MEMLEA(0x10,3) ",%3 \n"
    "sub $0x4,%4 \n"
    "jg 1b \n"
  : "=&d"(pixel_temp),  // %0
    "=&a"(table_temp),  // %1
    "+r"(src_argb),     // %2
    "+r"(dst_argb),     // %3
    "+rm"(width)        // %4
  : "r"(luma),          // %5
    "rm"(lumacoeff)     // %6
  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
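
// A scalar sketch (hypothetical guard, not part of the original file) of the
// luma table row above: a weighted sum of the channels (weights packed as
// signed bytes in lumacoeff; the B, G, R ordering below is an assumption)
// selects one of 256 rows of 256 bytes in the luma table, and B, G and R are
// then looked up in that row while alpha is copied through unchanged.
#ifdef LIBYUV_ROW_REFERENCE_SKETCHES
static void ARGBLumaColorTableRow_Sketch(const uint8* src_argb,
                                         uint8* dst_argb, int width,
                                         const uint8* luma,
                                         uint32 lumacoeff) {
  int bw = (int8)(lumacoeff & 0xff);
  int gw = (int8)((lumacoeff >> 8) & 0xff);
  int rw = (int8)((lumacoeff >> 16) & 0xff);
  int i;
  for (i = 0; i < width; ++i) {
    // High byte of the weighted sum picks the 256-byte table row.
    int sum = src_argb[0] * bw + src_argb[1] * gw + src_argb[2] * rw;
    const uint8* row = luma + (sum & 0xff00);
    dst_argb[0] = row[src_argb[0]];
    dst_argb[1] = row[src_argb[1]];
    dst_argb[2] = row[src_argb[2]];
    dst_argb[3] = src_argb[3];  // Alpha is copied, not remapped.
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif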

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif