1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #include "libyuv/basic_types.h"
14
15 #ifdef __cplusplus
16 namespace libyuv {
17 extern "C" {
18 #endif
19
20 // This module is for GCC x86 and x64
21 #if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
22
23 // GCC 4.2 on OSX has a link error when passing static or const to inline.
24 // TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
25 #ifdef __APPLE__
26 #define CONST
27 #else
28 #define CONST static const
29 #endif
30
31 #ifdef HAS_ARGBTOYROW_SSSE3
32
33 // Constants for ARGB
34 CONST vec8 kARGBToY = {
35 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36 };
37
38 CONST vec8 kARGBToU = {
39 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40 };
41
42 CONST vec8 kARGBToV = {
43 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
44 };
45
46 // Constants for BGRA
47 CONST vec8 kBGRAToY = {
48 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49 };
50
51 CONST vec8 kBGRAToU = {
52 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53 };
54
55 CONST vec8 kBGRAToV = {
56 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57 };
58
59 // Constants for ABGR
60 CONST vec8 kABGRToY = {
61 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62 };
63
64 CONST vec8 kABGRToU = {
65 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66 };
67
68 CONST vec8 kABGRToV = {
69 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70 };
71
72 CONST uvec8 kAddY16 = {
73 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
74 };
75
76 CONST uvec8 kAddUV128 = {
77 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
78 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
79 };
80
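// Illustrative scalar equivalents of the fixed-point math encoded by the
// coefficient tables above (a hedged sketch; these helpers are hypothetical
// and not part of the build). ARGB bytes are stored B,G,R,A, so the row
// functions below compute, per pixel, roughly:
//   uint8 RGBToY(uint8 b, uint8 g, uint8 r) {
//     return ((33 * r + 65 * g + 13 * b) >> 7) + 16;
//   }
//   uint8 RGBToU(uint8 b, uint8 g, uint8 r) {
//     return ((112 * b - 74 * g - 38 * r) >> 8) + 128;
//   }
//   uint8 RGBToV(uint8 b, uint8 g, uint8 r) {
//     return ((112 * r - 94 * g - 18 * b) >> 8) + 128;
//   }
// The U/V row functions additionally average 2x2 blocks of pixels before
// applying these weights, and the BGRA/ABGR tables apply the same weights to
// their respective byte orders.
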
81 // Shuffle table for converting RGB24 to ARGB.
82 CONST uvec8 kShuffleMaskRGB24ToARGB = {
83 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
84 };
85
86 // Shuffle table for converting RAW to ARGB.
87 CONST uvec8 kShuffleMaskRAWToARGB = {
88 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
89 };
90
91 // Shuffle table for converting ABGR to ARGB.
92 CONST uvec8 kShuffleMaskABGRToARGB = {
93 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
94 };
95
96 // Shuffle table for converting BGRA to ARGB.
97 CONST uvec8 kShuffleMaskBGRAToARGB = {
98 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
99 };
100
101 // Shuffle table for converting RGBA to ARGB.
102 CONST uvec8 kShuffleMaskRGBAToARGB = {
103 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
104 };
105
106 // Shuffle table for converting ARGB to RGBA.
107 CONST uvec8 kShuffleMaskARGBToRGBA = {
108 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
109 };
110
111 // Shuffle table for converting ARGB to RGB24.
112 CONST uvec8 kShuffleMaskARGBToRGB24 = {
113 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
114 };
115
116 // Shuffle table for converting ARGB to RAW.
117 CONST uvec8 kShuffleMaskARGBToRAW = {
118 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
119 };
120
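// Each shuffle mask above is used with pshufb: entry i selects the source
// byte index copied to destination byte i, and 128 (0x80) zeroes that byte.
// As a hedged scalar sketch (hypothetical, not built) of one 4-byte pixel for
// the ABGR-to-ARGB case, kShuffleMaskABGRToARGB = {2, 1, 0, 3, ...} means:
//   dst[0] = src[2];
//   dst[1] = src[1];
//   dst[2] = src[0];
//   dst[3] = src[3];
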
121 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
122 asm volatile (
123 "pcmpeqb %%xmm5,%%xmm5 \n"
124 "pslld $0x18,%%xmm5 \n"
125 ".p2align 4 \n"
126 "1: \n"
127 "movq (%0),%%xmm0 \n"
128 "lea 0x8(%0),%0 \n"
129 "punpcklbw %%xmm0,%%xmm0 \n"
130 "movdqa %%xmm0,%%xmm1 \n"
131 "punpcklwd %%xmm0,%%xmm0 \n"
132 "punpckhwd %%xmm1,%%xmm1 \n"
133 "por %%xmm5,%%xmm0 \n"
134 "por %%xmm5,%%xmm1 \n"
135 "movdqa %%xmm0,(%1) \n"
136 "movdqa %%xmm1,0x10(%1) \n"
137 "lea 0x20(%1),%1 \n"
138 "sub $0x8,%2 \n"
139 "jg 1b \n"
140 : "+r"(src_y), // %0
141 "+r"(dst_argb), // %1
142 "+r"(pix) // %2
143 :
144 : "memory", "cc"
145 #if defined(__SSE2__)
146 , "xmm0", "xmm1", "xmm5"
147 #endif
148 );
149 }
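
// For reference (a hedged sketch, not built): the routine above expands each
// luma byte into a gray, fully opaque ARGB pixel, roughly:
//   for (int i = 0; i < pix; ++i) {
//     uint8 y = src_y[i];
//     dst_argb[4 * i + 0] = y;     // B
//     dst_argb[4 * i + 1] = y;     // G
//     dst_argb[4 * i + 2] = y;     // R
//     dst_argb[4 * i + 3] = 255u;  // A
//   }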
150
151 void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
152 asm volatile (
153 "movdqa %3,%%xmm5 \n"
154 "sub %0,%1 \n"
155 ".p2align 4 \n"
156 "1: \n"
157 "movdqa (%0),%%xmm0 \n"
158 "pshufb %%xmm5,%%xmm0 \n"
159 "sub $0x4,%2 \n"
160 "movdqa %%xmm0,(%0,%1,1) \n"
161 "lea 0x10(%0),%0 \n"
162 "jg 1b \n"
163
164 : "+r"(src_abgr), // %0
165 "+r"(dst_argb), // %1
166 "+r"(pix) // %2
167 : "m"(kShuffleMaskABGRToARGB) // %3
168 : "memory", "cc"
169 #if defined(__SSE2__)
170 , "xmm0", "xmm5"
171 #endif
172 );
173 }
174
175 void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
176 asm volatile (
177 "movdqa %3,%%xmm5 \n"
178 "sub %0,%1 \n"
179 ".p2align 4 \n"
180 "1: \n"
181 "movdqa (%0),%%xmm0 \n"
182 "pshufb %%xmm5,%%xmm0 \n"
183 "sub $0x4,%2 \n"
184 "movdqa %%xmm0,(%0,%1,1) \n"
185 "lea 0x10(%0),%0 \n"
186 "jg 1b \n"
187 : "+r"(src_bgra), // %0
188 "+r"(dst_argb), // %1
189 "+r"(pix) // %2
190 : "m"(kShuffleMaskBGRAToARGB) // %3
191 : "memory", "cc"
192 #if defined(__SSE2__)
193 , "xmm0", "xmm5"
194 #endif
195 );
196 }
197
198 void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
199 asm volatile (
200 "movdqa %3,%%xmm5 \n"
201 "sub %0,%1 \n"
202 ".p2align 4 \n"
203 "1: \n"
204 "movdqa (%0),%%xmm0 \n"
205 "pshufb %%xmm5,%%xmm0 \n"
206 "sub $0x4,%2 \n"
207 "movdqa %%xmm0,(%0,%1,1) \n"
208 "lea 0x10(%0),%0 \n"
209 "jg 1b \n"
210
211 : "+r"(src_rgba), // %0
212 "+r"(dst_argb), // %1
213 "+r"(pix) // %2
214 : "m"(kShuffleMaskRGBAToARGB) // %3
215 : "memory", "cc"
216 #if defined(__SSE2__)
217 , "xmm0", "xmm5"
218 #endif
219 );
220 }
221
222 void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
223 asm volatile (
224 "movdqa %3,%%xmm5 \n"
225 "sub %0,%1 \n"
226 ".p2align 4 \n"
227 "1: \n"
228 "movdqa (%0),%%xmm0 \n"
229 "pshufb %%xmm5,%%xmm0 \n"
230 "sub $0x4,%2 \n"
231 "movdqa %%xmm0,(%0,%1,1) \n"
232 "lea 0x10(%0),%0 \n"
233 "jg 1b \n"
234
235 : "+r"(src_argb), // %0
236 "+r"(dst_rgba), // %1
237 "+r"(pix) // %2
238 : "m"(kShuffleMaskARGBToRGBA) // %3
239 : "memory", "cc"
240 #if defined(__SSE2__)
241 , "xmm0", "xmm5"
242 #endif
243 );
244 }
245
246 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
247 asm volatile (
248 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
249 "pslld $0x18,%%xmm5 \n"
250 "movdqa %3,%%xmm4 \n"
251 ".p2align 4 \n"
252 "1: \n"
253 "movdqu (%0),%%xmm0 \n"
254 "movdqu 0x10(%0),%%xmm1 \n"
255 "movdqu 0x20(%0),%%xmm3 \n"
256 "lea 0x30(%0),%0 \n"
257 "movdqa %%xmm3,%%xmm2 \n"
258 "palignr $0x8,%%xmm1,%%xmm2 \n"
259 "pshufb %%xmm4,%%xmm2 \n"
260 "por %%xmm5,%%xmm2 \n"
261 "palignr $0xc,%%xmm0,%%xmm1 \n"
262 "pshufb %%xmm4,%%xmm0 \n"
263 "movdqa %%xmm2,0x20(%1) \n"
264 "por %%xmm5,%%xmm0 \n"
265 "pshufb %%xmm4,%%xmm1 \n"
266 "movdqa %%xmm0,(%1) \n"
267 "por %%xmm5,%%xmm1 \n"
268 "palignr $0x4,%%xmm3,%%xmm3 \n"
269 "pshufb %%xmm4,%%xmm3 \n"
270 "movdqa %%xmm1,0x10(%1) \n"
271 "por %%xmm5,%%xmm3 \n"
272 "sub $0x10,%2 \n"
273 "movdqa %%xmm3,0x30(%1) \n"
274 "lea 0x40(%1),%1 \n"
275 "jg 1b \n"
276 : "+r"(src_rgb24), // %0
277 "+r"(dst_argb), // %1
278 "+r"(pix) // %2
279 : "m"(kShuffleMaskRGB24ToARGB) // %3
280 : "memory", "cc"
281 #if defined(__SSE2__)
282 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
283 #endif
284 );
285 }
286
287 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
288 asm volatile (
289 "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
290 "pslld $0x18,%%xmm5 \n"
291 "movdqa %3,%%xmm4 \n"
292 ".p2align 4 \n"
293 "1: \n"
294 "movdqu (%0),%%xmm0 \n"
295 "movdqu 0x10(%0),%%xmm1 \n"
296 "movdqu 0x20(%0),%%xmm3 \n"
297 "lea 0x30(%0),%0 \n"
298 "movdqa %%xmm3,%%xmm2 \n"
299 "palignr $0x8,%%xmm1,%%xmm2 \n"
300 "pshufb %%xmm4,%%xmm2 \n"
301 "por %%xmm5,%%xmm2 \n"
302 "palignr $0xc,%%xmm0,%%xmm1 \n"
303 "pshufb %%xmm4,%%xmm0 \n"
304 "movdqa %%xmm2,0x20(%1) \n"
305 "por %%xmm5,%%xmm0 \n"
306 "pshufb %%xmm4,%%xmm1 \n"
307 "movdqa %%xmm0,(%1) \n"
308 "por %%xmm5,%%xmm1 \n"
309 "palignr $0x4,%%xmm3,%%xmm3 \n"
310 "pshufb %%xmm4,%%xmm3 \n"
311 "movdqa %%xmm1,0x10(%1) \n"
312 "por %%xmm5,%%xmm3 \n"
313 "sub $0x10,%2 \n"
314 "movdqa %%xmm3,0x30(%1) \n"
315 "lea 0x40(%1),%1 \n"
316 "jg 1b \n"
317 : "+r"(src_raw), // %0
318 "+r"(dst_argb), // %1
319 "+r"(pix) // %2
320 : "m"(kShuffleMaskRAWToARGB) // %3
321 : "memory", "cc"
322 #if defined(__SSE2__)
323 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
324 #endif
325 );
326 }
327
328 void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
329 asm volatile (
330 "mov $0x1080108,%%eax \n"
331 "movd %%eax,%%xmm5 \n"
332 "pshufd $0x0,%%xmm5,%%xmm5 \n"
333 "mov $0x20802080,%%eax \n"
334 "movd %%eax,%%xmm6 \n"
335 "pshufd $0x0,%%xmm6,%%xmm6 \n"
336 "pcmpeqb %%xmm3,%%xmm3 \n"
337 "psllw $0xb,%%xmm3 \n"
338 "pcmpeqb %%xmm4,%%xmm4 \n"
339 "psllw $0xa,%%xmm4 \n"
340 "psrlw $0x5,%%xmm4 \n"
341 "pcmpeqb %%xmm7,%%xmm7 \n"
342 "psllw $0x8,%%xmm7 \n"
343 "sub %0,%1 \n"
344 "sub %0,%1 \n"
345 ".p2align 4 \n"
346 "1: \n"
347 "movdqu (%0),%%xmm0 \n"
348 "movdqa %%xmm0,%%xmm1 \n"
349 "movdqa %%xmm0,%%xmm2 \n"
350 "pand %%xmm3,%%xmm1 \n"
351 "psllw $0xb,%%xmm2 \n"
352 "pmulhuw %%xmm5,%%xmm1 \n"
353 "pmulhuw %%xmm5,%%xmm2 \n"
354 "psllw $0x8,%%xmm1 \n"
355 "por %%xmm2,%%xmm1 \n"
356 "pand %%xmm4,%%xmm0 \n"
357 "pmulhuw %%xmm6,%%xmm0 \n"
358 "por %%xmm7,%%xmm0 \n"
359 "movdqa %%xmm1,%%xmm2 \n"
360 "punpcklbw %%xmm0,%%xmm1 \n"
361 "punpckhbw %%xmm0,%%xmm2 \n"
362 "movdqa %%xmm1,(%1,%0,2) \n"
363 "movdqa %%xmm2,0x10(%1,%0,2) \n"
364 "lea 0x10(%0),%0 \n"
365 "sub $0x8,%2 \n"
366 "jg 1b \n"
367 : "+r"(src), // %0
368 "+r"(dst), // %1
369 "+r"(pix) // %2
370 :
371 : "memory", "cc", "eax"
372 #if defined(__SSE2__)
373 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
374 #endif
375 );
376 }
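
// Note on the magic constants above (and the similar ones in
// ARGB1555ToARGBRow_SSE2 below): the pmulhuw multiplies replicate the top
// bits of each narrow field, so a 5-bit value v widens to (v << 3) | (v >> 2)
// and a 6-bit value widens to (v << 2) | (v >> 4), mapping 31 and 63 to 255.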
377
378 void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
379 asm volatile (
380 "mov $0x1080108,%%eax \n"
381 "movd %%eax,%%xmm5 \n"
382 "pshufd $0x0,%%xmm5,%%xmm5 \n"
383 "mov $0x42004200,%%eax \n"
384 "movd %%eax,%%xmm6 \n"
385 "pshufd $0x0,%%xmm6,%%xmm6 \n"
386 "pcmpeqb %%xmm3,%%xmm3 \n"
387 "psllw $0xb,%%xmm3 \n"
388 "movdqa %%xmm3,%%xmm4 \n"
389 "psrlw $0x6,%%xmm4 \n"
390 "pcmpeqb %%xmm7,%%xmm7 \n"
391 "psllw $0x8,%%xmm7 \n"
392 "sub %0,%1 \n"
393 "sub %0,%1 \n"
394 ".p2align 4 \n"
395 "1: \n"
396 "movdqu (%0),%%xmm0 \n"
397 "movdqa %%xmm0,%%xmm1 \n"
398 "movdqa %%xmm0,%%xmm2 \n"
399 "psllw $0x1,%%xmm1 \n"
400 "psllw $0xb,%%xmm2 \n"
401 "pand %%xmm3,%%xmm1 \n"
402 "pmulhuw %%xmm5,%%xmm2 \n"
403 "pmulhuw %%xmm5,%%xmm1 \n"
404 "psllw $0x8,%%xmm1 \n"
405 "por %%xmm2,%%xmm1 \n"
406 "movdqa %%xmm0,%%xmm2 \n"
407 "pand %%xmm4,%%xmm0 \n"
408 "psraw $0x8,%%xmm2 \n"
409 "pmulhuw %%xmm6,%%xmm0 \n"
410 "pand %%xmm7,%%xmm2 \n"
411 "por %%xmm2,%%xmm0 \n"
412 "movdqa %%xmm1,%%xmm2 \n"
413 "punpcklbw %%xmm0,%%xmm1 \n"
414 "punpckhbw %%xmm0,%%xmm2 \n"
415 "movdqa %%xmm1,(%1,%0,2) \n"
416 "movdqa %%xmm2,0x10(%1,%0,2) \n"
417 "lea 0x10(%0),%0 \n"
418 "sub $0x8,%2 \n"
419 "jg 1b \n"
420 : "+r"(src), // %0
421 "+r"(dst), // %1
422 "+r"(pix) // %2
423 :
424 : "memory", "cc", "eax"
425 #if defined(__SSE2__)
426 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
427 #endif
428 );
429 }
430
431 void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
432 asm volatile (
433 "mov $0xf0f0f0f,%%eax \n"
434 "movd %%eax,%%xmm4 \n"
435 "pshufd $0x0,%%xmm4,%%xmm4 \n"
436 "movdqa %%xmm4,%%xmm5 \n"
437 "pslld $0x4,%%xmm5 \n"
438 "sub %0,%1 \n"
439 "sub %0,%1 \n"
440 ".p2align 4 \n"
441 "1: \n"
442 "movdqu (%0),%%xmm0 \n"
443 "movdqa %%xmm0,%%xmm2 \n"
444 "pand %%xmm4,%%xmm0 \n"
445 "pand %%xmm5,%%xmm2 \n"
446 "movdqa %%xmm0,%%xmm1 \n"
447 "movdqa %%xmm2,%%xmm3 \n"
448 "psllw $0x4,%%xmm1 \n"
449 "psrlw $0x4,%%xmm3 \n"
450 "por %%xmm1,%%xmm0 \n"
451 "por %%xmm3,%%xmm2 \n"
452 "movdqa %%xmm0,%%xmm1 \n"
453 "punpcklbw %%xmm2,%%xmm0 \n"
454 "punpckhbw %%xmm2,%%xmm1 \n"
455 "movdqa %%xmm0,(%1,%0,2) \n"
456 "movdqa %%xmm1,0x10(%1,%0,2) \n"
457 "lea 0x10(%0),%0 \n"
458 "sub $0x8,%2 \n"
459 "jg 1b \n"
460 : "+r"(src), // %0
461 "+r"(dst), // %1
462 "+r"(pix) // %2
463 :
464 : "memory", "cc", "eax"
465 #if defined(__SSE2__)
466 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
467 #endif
468 );
469 }
470
471 void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
472 asm volatile (
473 "movdqa %3,%%xmm6 \n"
474 ".p2align 4 \n"
475 "1: \n"
476 "movdqa (%0),%%xmm0 \n"
477 "movdqa 0x10(%0),%%xmm1 \n"
478 "movdqa 0x20(%0),%%xmm2 \n"
479 "movdqa 0x30(%0),%%xmm3 \n"
480 "lea 0x40(%0),%0 \n"
481 "pshufb %%xmm6,%%xmm0 \n"
482 "pshufb %%xmm6,%%xmm1 \n"
483 "pshufb %%xmm6,%%xmm2 \n"
484 "pshufb %%xmm6,%%xmm3 \n"
485 "movdqa %%xmm1,%%xmm4 \n"
486 "psrldq $0x4,%%xmm1 \n"
487 "pslldq $0xc,%%xmm4 \n"
488 "movdqa %%xmm2,%%xmm5 \n"
489 "por %%xmm4,%%xmm0 \n"
490 "pslldq $0x8,%%xmm5 \n"
491 "movdqa %%xmm0,(%1) \n"
492 "por %%xmm5,%%xmm1 \n"
493 "psrldq $0x8,%%xmm2 \n"
494 "pslldq $0x4,%%xmm3 \n"
495 "por %%xmm3,%%xmm2 \n"
496 "movdqa %%xmm1,0x10(%1) \n"
497 "movdqa %%xmm2,0x20(%1) \n"
498 "lea 0x30(%1),%1 \n"
499 "sub $0x10,%2 \n"
500 "jg 1b \n"
501 : "+r"(src), // %0
502 "+r"(dst), // %1
503 "+r"(pix) // %2
504 : "m"(kShuffleMaskARGBToRGB24) // %3
505 : "memory", "cc"
506 #if defined(__SSE2__)
507 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
508 #endif
509 );
510 }
511
512 void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
513 asm volatile (
514 "movdqa %3,%%xmm6 \n"
515 ".p2align 4 \n"
516 "1: \n"
517 "movdqa (%0),%%xmm0 \n"
518 "movdqa 0x10(%0),%%xmm1 \n"
519 "movdqa 0x20(%0),%%xmm2 \n"
520 "movdqa 0x30(%0),%%xmm3 \n"
521 "lea 0x40(%0),%0 \n"
522 "pshufb %%xmm6,%%xmm0 \n"
523 "pshufb %%xmm6,%%xmm1 \n"
524 "pshufb %%xmm6,%%xmm2 \n"
525 "pshufb %%xmm6,%%xmm3 \n"
526 "movdqa %%xmm1,%%xmm4 \n"
527 "psrldq $0x4,%%xmm1 \n"
528 "pslldq $0xc,%%xmm4 \n"
529 "movdqa %%xmm2,%%xmm5 \n"
530 "por %%xmm4,%%xmm0 \n"
531 "pslldq $0x8,%%xmm5 \n"
532 "movdqa %%xmm0,(%1) \n"
533 "por %%xmm5,%%xmm1 \n"
534 "psrldq $0x8,%%xmm2 \n"
535 "pslldq $0x4,%%xmm3 \n"
536 "por %%xmm3,%%xmm2 \n"
537 "movdqa %%xmm1,0x10(%1) \n"
538 "movdqa %%xmm2,0x20(%1) \n"
539 "lea 0x30(%1),%1 \n"
540 "sub $0x10,%2 \n"
541 "jg 1b \n"
542 : "+r"(src), // %0
543 "+r"(dst), // %1
544 "+r"(pix) // %2
545 : "m"(kShuffleMaskARGBToRAW) // %3
546 : "memory", "cc"
547 #if defined(__SSE2__)
548 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
549 #endif
550 );
551 }
552
553 void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
554 asm volatile (
555 "pcmpeqb %%xmm3,%%xmm3 \n"
556 "psrld $0x1b,%%xmm3 \n"
557 "pcmpeqb %%xmm4,%%xmm4 \n"
558 "psrld $0x1a,%%xmm4 \n"
559 "pslld $0x5,%%xmm4 \n"
560 "pcmpeqb %%xmm5,%%xmm5 \n"
561 "pslld $0xb,%%xmm5 \n"
562 ".p2align 4 \n"
563 "1: \n"
564 "movdqa (%0),%%xmm0 \n"
565 "movdqa %%xmm0,%%xmm1 \n"
566 "movdqa %%xmm0,%%xmm2 \n"
567 "pslld $0x8,%%xmm0 \n"
568 "psrld $0x3,%%xmm1 \n"
569 "psrld $0x5,%%xmm2 \n"
570 "psrad $0x10,%%xmm0 \n"
571 "pand %%xmm3,%%xmm1 \n"
572 "pand %%xmm4,%%xmm2 \n"
573 "pand %%xmm5,%%xmm0 \n"
574 "por %%xmm2,%%xmm1 \n"
575 "por %%xmm1,%%xmm0 \n"
576 "packssdw %%xmm0,%%xmm0 \n"
577 "lea 0x10(%0),%0 \n"
578 "movq %%xmm0,(%1) \n"
579 "lea 0x8(%1),%1 \n"
580 "sub $0x4,%2 \n"
581 "jg 1b \n"
582 : "+r"(src), // %0
583 "+r"(dst), // %1
584 "+r"(pix) // %2
585 :
586 : "memory", "cc"
587 #if defined(__SSE2__)
588 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
589 #endif
590 );
591 }
592
593 void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
594 asm volatile (
595 "pcmpeqb %%xmm4,%%xmm4 \n"
596 "psrld $0x1b,%%xmm4 \n"
597 "movdqa %%xmm4,%%xmm5 \n"
598 "pslld $0x5,%%xmm5 \n"
599 "movdqa %%xmm4,%%xmm6 \n"
600 "pslld $0xa,%%xmm6 \n"
601 "pcmpeqb %%xmm7,%%xmm7 \n"
602 "pslld $0xf,%%xmm7 \n"
603 ".p2align 4 \n"
604 "1: \n"
605 "movdqa (%0),%%xmm0 \n"
606 "movdqa %%xmm0,%%xmm1 \n"
607 "movdqa %%xmm0,%%xmm2 \n"
608 "movdqa %%xmm0,%%xmm3 \n"
609 "psrad $0x10,%%xmm0 \n"
610 "psrld $0x3,%%xmm1 \n"
611 "psrld $0x6,%%xmm2 \n"
612 "psrld $0x9,%%xmm3 \n"
613 "pand %%xmm7,%%xmm0 \n"
614 "pand %%xmm4,%%xmm1 \n"
615 "pand %%xmm5,%%xmm2 \n"
616 "pand %%xmm6,%%xmm3 \n"
617 "por %%xmm1,%%xmm0 \n"
618 "por %%xmm3,%%xmm2 \n"
619 "por %%xmm2,%%xmm0 \n"
620 "packssdw %%xmm0,%%xmm0 \n"
621 "lea 0x10(%0),%0 \n"
622 "movq %%xmm0,(%1) \n"
623 "lea 0x8(%1),%1 \n"
624 "sub $0x4,%2 \n"
625 "jg 1b \n"
626 : "+r"(src), // %0
627 "+r"(dst), // %1
628 "+r"(pix) // %2
629 :
630 : "memory", "cc"
631 #if defined(__SSE2__)
632 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
633 #endif
634 );
635 }
636
637 void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
638 asm volatile (
639 "pcmpeqb %%xmm4,%%xmm4 \n"
640 "psllw $0xc,%%xmm4 \n"
641 "movdqa %%xmm4,%%xmm3 \n"
642 "psrlw $0x8,%%xmm3 \n"
643 ".p2align 4 \n"
644 "1: \n"
645 "movdqa (%0),%%xmm0 \n"
646 "movdqa %%xmm0,%%xmm1 \n"
647 "pand %%xmm3,%%xmm0 \n"
648 "pand %%xmm4,%%xmm1 \n"
649 "psrlq $0x4,%%xmm0 \n"
650 "psrlq $0x8,%%xmm1 \n"
651 "por %%xmm1,%%xmm0 \n"
652 "packuswb %%xmm0,%%xmm0 \n"
653 "lea 0x10(%0),%0 \n"
654 "movq %%xmm0,(%1) \n"
655 "lea 0x8(%1),%1 \n"
656 "sub $0x4,%2 \n"
657 "jg 1b \n"
658 : "+r"(src), // %0
659 "+r"(dst), // %1
660 "+r"(pix) // %2
661 :
662 : "memory", "cc"
663 #if defined(__SSE2__)
664 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
665 #endif
666 );
667 }
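
// For reference (a hedged scalar sketch, not built), the three pack routines
// above reduce each ARGB pixel with bytes b, g, r, a to:
//   RGB565:   (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11)
//   ARGB1555: (b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10) | ((a >> 7) << 15)
//   ARGB4444: (b >> 4) | ((g >> 4) << 4) | ((r >> 4) << 8) | ((a >> 4) << 12)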
668
669 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
670 asm volatile (
671 "movdqa %4,%%xmm5 \n"
672 "movdqa %3,%%xmm4 \n"
673 ".p2align 4 \n"
674 "1: \n"
675 "movdqa (%0),%%xmm0 \n"
676 "movdqa 0x10(%0),%%xmm1 \n"
677 "movdqa 0x20(%0),%%xmm2 \n"
678 "movdqa 0x30(%0),%%xmm3 \n"
679 "pmaddubsw %%xmm4,%%xmm0 \n"
680 "pmaddubsw %%xmm4,%%xmm1 \n"
681 "pmaddubsw %%xmm4,%%xmm2 \n"
682 "pmaddubsw %%xmm4,%%xmm3 \n"
683 "lea 0x40(%0),%0 \n"
684 "phaddw %%xmm1,%%xmm0 \n"
685 "phaddw %%xmm3,%%xmm2 \n"
686 "psrlw $0x7,%%xmm0 \n"
687 "psrlw $0x7,%%xmm2 \n"
688 "packuswb %%xmm2,%%xmm0 \n"
689 "paddb %%xmm5,%%xmm0 \n"
690 "sub $0x10,%2 \n"
691 "movdqa %%xmm0,(%1) \n"
692 "lea 0x10(%1),%1 \n"
693 "jg 1b \n"
694 : "+r"(src_argb), // %0
695 "+r"(dst_y), // %1
696 "+r"(pix) // %2
697 : "m"(kARGBToY), // %3
698 "m"(kAddY16) // %4
699 : "memory", "cc"
700 #if defined(__SSE2__)
701 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
702 #endif
703 );
704 }
705
706 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
707 asm volatile (
708 "movdqa %4,%%xmm5 \n"
709 "movdqa %3,%%xmm4 \n"
710 ".p2align 4 \n"
711 "1: \n"
712 "movdqu (%0),%%xmm0 \n"
713 "movdqu 0x10(%0),%%xmm1 \n"
714 "movdqu 0x20(%0),%%xmm2 \n"
715 "movdqu 0x30(%0),%%xmm3 \n"
716 "pmaddubsw %%xmm4,%%xmm0 \n"
717 "pmaddubsw %%xmm4,%%xmm1 \n"
718 "pmaddubsw %%xmm4,%%xmm2 \n"
719 "pmaddubsw %%xmm4,%%xmm3 \n"
720 "lea 0x40(%0),%0 \n"
721 "phaddw %%xmm1,%%xmm0 \n"
722 "phaddw %%xmm3,%%xmm2 \n"
723 "psrlw $0x7,%%xmm0 \n"
724 "psrlw $0x7,%%xmm2 \n"
725 "packuswb %%xmm2,%%xmm0 \n"
726 "paddb %%xmm5,%%xmm0 \n"
727 "sub $0x10,%2 \n"
728 "movdqu %%xmm0,(%1) \n"
729 "lea 0x10(%1),%1 \n"
730 "jg 1b \n"
731 : "+r"(src_argb), // %0
732 "+r"(dst_y), // %1
733 "+r"(pix) // %2
734 : "m"(kARGBToY), // %3
735 "m"(kAddY16) // %4
736 : "memory", "cc"
737 #if defined(__SSE2__)
738 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
739 #endif
740 );
741 }
742
743 // TODO(fbarchard): pass xmm constants to a single block of assembly.
744 // With -fpic, GCC 4.2 for OSX runs out of GPR registers. "m" effectively
745 // takes 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal
746 // registers, or 4 if the stack frame is disabled. Doing 2 assembly blocks
747 // is a workaround and considered unsafe.
748 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
749 uint8* dst_u, uint8* dst_v, int width) {
750 asm volatile (
751 "movdqa %0,%%xmm4 \n"
752 "movdqa %1,%%xmm3 \n"
753 "movdqa %2,%%xmm5 \n"
754 :
755 : "m"(kARGBToU), // %0
756 "m"(kARGBToV), // %1
757 "m"(kAddUV128) // %2
758 );
759 asm volatile (
760 "sub %1,%2 \n"
761 ".p2align 4 \n"
762 "1: \n"
763 "movdqa (%0),%%xmm0 \n"
764 "movdqa 0x10(%0),%%xmm1 \n"
765 "movdqa 0x20(%0),%%xmm2 \n"
766 "movdqa 0x30(%0),%%xmm6 \n"
767 "pavgb (%0,%4,1),%%xmm0 \n"
768 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
769 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
770 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
771 "lea 0x40(%0),%0 \n"
772 "movdqa %%xmm0,%%xmm7 \n"
773 "shufps $0x88,%%xmm1,%%xmm0 \n"
774 "shufps $0xdd,%%xmm1,%%xmm7 \n"
775 "pavgb %%xmm7,%%xmm0 \n"
776 "movdqa %%xmm2,%%xmm7 \n"
777 "shufps $0x88,%%xmm6,%%xmm2 \n"
778 "shufps $0xdd,%%xmm6,%%xmm7 \n"
779 "pavgb %%xmm7,%%xmm2 \n"
780 "movdqa %%xmm0,%%xmm1 \n"
781 "movdqa %%xmm2,%%xmm6 \n"
782 "pmaddubsw %%xmm4,%%xmm0 \n"
783 "pmaddubsw %%xmm4,%%xmm2 \n"
784 "pmaddubsw %%xmm3,%%xmm1 \n"
785 "pmaddubsw %%xmm3,%%xmm6 \n"
786 "phaddw %%xmm2,%%xmm0 \n"
787 "phaddw %%xmm6,%%xmm1 \n"
788 "psraw $0x8,%%xmm0 \n"
789 "psraw $0x8,%%xmm1 \n"
790 "packsswb %%xmm1,%%xmm0 \n"
791 "paddb %%xmm5,%%xmm0 \n"
792 "sub $0x10,%3 \n"
793 "movlps %%xmm0,(%1) \n"
794 "movhps %%xmm0,(%1,%2,1) \n"
795 "lea 0x8(%1),%1 \n"
796 "jg 1b \n"
797 : "+r"(src_argb0), // %0
798 "+r"(dst_u), // %1
799 "+r"(dst_v), // %2
800 "+rm"(width) // %3
801 : "r"(static_cast<intptr_t>(src_stride_argb))
802 : "memory", "cc"
803 #if defined(__SSE2__)
804 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
805 #endif
806 );
807 }
808
809 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
810 uint8* dst_u, uint8* dst_v, int width) {
811 asm volatile (
812 "movdqa %0,%%xmm4 \n"
813 "movdqa %1,%%xmm3 \n"
814 "movdqa %2,%%xmm5 \n"
815 :
816 : "m"(kARGBToU), // %0
817 "m"(kARGBToV), // %1
818 "m"(kAddUV128) // %2
819 );
820 asm volatile (
821 "sub %1,%2 \n"
822 ".p2align 4 \n"
823 "1: \n"
824 "movdqu (%0),%%xmm0 \n"
825 "movdqu 0x10(%0),%%xmm1 \n"
826 "movdqu 0x20(%0),%%xmm2 \n"
827 "movdqu 0x30(%0),%%xmm6 \n"
828 "movdqu (%0,%4,1),%%xmm7 \n"
829 "pavgb %%xmm7,%%xmm0 \n"
830 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
831 "pavgb %%xmm7,%%xmm1 \n"
832 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
833 "pavgb %%xmm7,%%xmm2 \n"
834 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
835 "pavgb %%xmm7,%%xmm6 \n"
836 "lea 0x40(%0),%0 \n"
837 "movdqa %%xmm0,%%xmm7 \n"
838 "shufps $0x88,%%xmm1,%%xmm0 \n"
839 "shufps $0xdd,%%xmm1,%%xmm7 \n"
840 "pavgb %%xmm7,%%xmm0 \n"
841 "movdqa %%xmm2,%%xmm7 \n"
842 "shufps $0x88,%%xmm6,%%xmm2 \n"
843 "shufps $0xdd,%%xmm6,%%xmm7 \n"
844 "pavgb %%xmm7,%%xmm2 \n"
845 "movdqa %%xmm0,%%xmm1 \n"
846 "movdqa %%xmm2,%%xmm6 \n"
847 "pmaddubsw %%xmm4,%%xmm0 \n"
848 "pmaddubsw %%xmm4,%%xmm2 \n"
849 "pmaddubsw %%xmm3,%%xmm1 \n"
850 "pmaddubsw %%xmm3,%%xmm6 \n"
851 "phaddw %%xmm2,%%xmm0 \n"
852 "phaddw %%xmm6,%%xmm1 \n"
853 "psraw $0x8,%%xmm0 \n"
854 "psraw $0x8,%%xmm1 \n"
855 "packsswb %%xmm1,%%xmm0 \n"
856 "paddb %%xmm5,%%xmm0 \n"
857 "sub $0x10,%3 \n"
858 "movlps %%xmm0,(%1) \n"
859 "movhps %%xmm0,(%1,%2,1) \n"
860 "lea 0x8(%1),%1 \n"
861 "jg 1b \n"
862 : "+r"(src_argb0), // %0
863 "+r"(dst_u), // %1
864 "+r"(dst_v), // %2
865 "+rm"(width) // %3
866 : "r"(static_cast<intptr_t>(src_stride_argb))
867 : "memory", "cc"
868 #if defined(__SSE2__)
869 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
870 #endif
871 );
872 }
873
874 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
875 asm volatile (
876 "movdqa %4,%%xmm5 \n"
877 "movdqa %3,%%xmm4 \n"
878 ".p2align 4 \n"
879 "1: \n"
880 "movdqa (%0),%%xmm0 \n"
881 "movdqa 0x10(%0),%%xmm1 \n"
882 "movdqa 0x20(%0),%%xmm2 \n"
883 "movdqa 0x30(%0),%%xmm3 \n"
884 "pmaddubsw %%xmm4,%%xmm0 \n"
885 "pmaddubsw %%xmm4,%%xmm1 \n"
886 "pmaddubsw %%xmm4,%%xmm2 \n"
887 "pmaddubsw %%xmm4,%%xmm3 \n"
888 "lea 0x40(%0),%0 \n"
889 "phaddw %%xmm1,%%xmm0 \n"
890 "phaddw %%xmm3,%%xmm2 \n"
891 "psrlw $0x7,%%xmm0 \n"
892 "psrlw $0x7,%%xmm2 \n"
893 "packuswb %%xmm2,%%xmm0 \n"
894 "paddb %%xmm5,%%xmm0 \n"
895 "sub $0x10,%2 \n"
896 "movdqa %%xmm0,(%1) \n"
897 "lea 0x10(%1),%1 \n"
898 "jg 1b \n"
899 : "+r"(src_bgra), // %0
900 "+r"(dst_y), // %1
901 "+r"(pix) // %2
902 : "m"(kBGRAToY), // %3
903 "m"(kAddY16) // %4
904 : "memory", "cc"
905 #if defined(__SSE2__)
906 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
907 #endif
908 );
909 }
910
911 void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
912 asm volatile (
913 "movdqa %4,%%xmm5 \n"
914 "movdqa %3,%%xmm4 \n"
915 ".p2align 4 \n"
916 "1: \n"
917 "movdqu (%0),%%xmm0 \n"
918 "movdqu 0x10(%0),%%xmm1 \n"
919 "movdqu 0x20(%0),%%xmm2 \n"
920 "movdqu 0x30(%0),%%xmm3 \n"
921 "pmaddubsw %%xmm4,%%xmm0 \n"
922 "pmaddubsw %%xmm4,%%xmm1 \n"
923 "pmaddubsw %%xmm4,%%xmm2 \n"
924 "pmaddubsw %%xmm4,%%xmm3 \n"
925 "lea 0x40(%0),%0 \n"
926 "phaddw %%xmm1,%%xmm0 \n"
927 "phaddw %%xmm3,%%xmm2 \n"
928 "psrlw $0x7,%%xmm0 \n"
929 "psrlw $0x7,%%xmm2 \n"
930 "packuswb %%xmm2,%%xmm0 \n"
931 "paddb %%xmm5,%%xmm0 \n"
932 "sub $0x10,%2 \n"
933 "movdqu %%xmm0,(%1) \n"
934 "lea 0x10(%1),%1 \n"
935 "jg 1b \n"
936 : "+r"(src_bgra), // %0
937 "+r"(dst_y), // %1
938 "+r"(pix) // %2
939 : "m"(kBGRAToY), // %3
940 "m"(kAddY16) // %4
941 : "memory", "cc"
942 #if defined(__SSE2__)
943 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
944 #endif
945 );
946 }
947
948 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
949 uint8* dst_u, uint8* dst_v, int width) {
950 asm volatile (
951 "movdqa %0,%%xmm4 \n"
952 "movdqa %1,%%xmm3 \n"
953 "movdqa %2,%%xmm5 \n"
954 :
955 : "m"(kBGRAToU), // %0
956 "m"(kBGRAToV), // %1
957 "m"(kAddUV128) // %2
958 );
959 asm volatile (
960 "sub %1,%2 \n"
961 ".p2align 4 \n"
962 "1: \n"
963 "movdqa (%0),%%xmm0 \n"
964 "movdqa 0x10(%0),%%xmm1 \n"
965 "movdqa 0x20(%0),%%xmm2 \n"
966 "movdqa 0x30(%0),%%xmm6 \n"
967 "pavgb (%0,%4,1),%%xmm0 \n"
968 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
969 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
970 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
971 "lea 0x40(%0),%0 \n"
972 "movdqa %%xmm0,%%xmm7 \n"
973 "shufps $0x88,%%xmm1,%%xmm0 \n"
974 "shufps $0xdd,%%xmm1,%%xmm7 \n"
975 "pavgb %%xmm7,%%xmm0 \n"
976 "movdqa %%xmm2,%%xmm7 \n"
977 "shufps $0x88,%%xmm6,%%xmm2 \n"
978 "shufps $0xdd,%%xmm6,%%xmm7 \n"
979 "pavgb %%xmm7,%%xmm2 \n"
980 "movdqa %%xmm0,%%xmm1 \n"
981 "movdqa %%xmm2,%%xmm6 \n"
982 "pmaddubsw %%xmm4,%%xmm0 \n"
983 "pmaddubsw %%xmm4,%%xmm2 \n"
984 "pmaddubsw %%xmm3,%%xmm1 \n"
985 "pmaddubsw %%xmm3,%%xmm6 \n"
986 "phaddw %%xmm2,%%xmm0 \n"
987 "phaddw %%xmm6,%%xmm1 \n"
988 "psraw $0x8,%%xmm0 \n"
989 "psraw $0x8,%%xmm1 \n"
990 "packsswb %%xmm1,%%xmm0 \n"
991 "paddb %%xmm5,%%xmm0 \n"
992 "sub $0x10,%3 \n"
993 "movlps %%xmm0,(%1) \n"
994 "movhps %%xmm0,(%1,%2,1) \n"
995 "lea 0x8(%1),%1 \n"
996 "jg 1b \n"
997 : "+r"(src_bgra0), // %0
998 "+r"(dst_u), // %1
999 "+r"(dst_v), // %2
1000 "+rm"(width) // %3
1001 : "r"(static_cast<intptr_t>(src_stride_bgra))
1002 : "memory", "cc"
1003 #if defined(__SSE2__)
1004 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1005 #endif
1006 );
1007 }
1008
1009 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1010 uint8* dst_u, uint8* dst_v, int width) {
1011 asm volatile (
1012 "movdqa %0,%%xmm4 \n"
1013 "movdqa %1,%%xmm3 \n"
1014 "movdqa %2,%%xmm5 \n"
1015 :
1016 : "m"(kBGRAToU), // %0
1017 "m"(kBGRAToV), // %1
1018 "m"(kAddUV128) // %2
1019 );
1020 asm volatile (
1021 "sub %1,%2 \n"
1022 ".p2align 4 \n"
1023 "1: \n"
1024 "movdqu (%0),%%xmm0 \n"
1025 "movdqu 0x10(%0),%%xmm1 \n"
1026 "movdqu 0x20(%0),%%xmm2 \n"
1027 "movdqu 0x30(%0),%%xmm6 \n"
1028 "movdqu (%0,%4,1),%%xmm7 \n"
1029 "pavgb %%xmm7,%%xmm0 \n"
1030 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1031 "pavgb %%xmm7,%%xmm1 \n"
1032 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1033 "pavgb %%xmm7,%%xmm2 \n"
1034 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1035 "pavgb %%xmm7,%%xmm6 \n"
1036 "lea 0x40(%0),%0 \n"
1037 "movdqa %%xmm0,%%xmm7 \n"
1038 "shufps $0x88,%%xmm1,%%xmm0 \n"
1039 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1040 "pavgb %%xmm7,%%xmm0 \n"
1041 "movdqa %%xmm2,%%xmm7 \n"
1042 "shufps $0x88,%%xmm6,%%xmm2 \n"
1043 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1044 "pavgb %%xmm7,%%xmm2 \n"
1045 "movdqa %%xmm0,%%xmm1 \n"
1046 "movdqa %%xmm2,%%xmm6 \n"
1047 "pmaddubsw %%xmm4,%%xmm0 \n"
1048 "pmaddubsw %%xmm4,%%xmm2 \n"
1049 "pmaddubsw %%xmm3,%%xmm1 \n"
1050 "pmaddubsw %%xmm3,%%xmm6 \n"
1051 "phaddw %%xmm2,%%xmm0 \n"
1052 "phaddw %%xmm6,%%xmm1 \n"
1053 "psraw $0x8,%%xmm0 \n"
1054 "psraw $0x8,%%xmm1 \n"
1055 "packsswb %%xmm1,%%xmm0 \n"
1056 "paddb %%xmm5,%%xmm0 \n"
1057 "sub $0x10,%3 \n"
1058 "movlps %%xmm0,(%1) \n"
1059 "movhps %%xmm0,(%1,%2,1) \n"
1060 "lea 0x8(%1),%1 \n"
1061 "jg 1b \n"
1062 : "+r"(src_bgra0), // %0
1063 "+r"(dst_u), // %1
1064 "+r"(dst_v), // %2
1065 "+rm"(width) // %3
1066 : "r"(static_cast<intptr_t>(src_stride_bgra))
1067 : "memory", "cc"
1068 #if defined(__SSE2__)
1069 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1070 #endif
1071 );
1072 }
1073
1074 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
1075 asm volatile (
1076 "movdqa %4,%%xmm5 \n"
1077 "movdqa %3,%%xmm4 \n"
1078 ".p2align 4 \n"
1079 "1: \n"
1080 "movdqa (%0),%%xmm0 \n"
1081 "movdqa 0x10(%0),%%xmm1 \n"
1082 "movdqa 0x20(%0),%%xmm2 \n"
1083 "movdqa 0x30(%0),%%xmm3 \n"
1084 "pmaddubsw %%xmm4,%%xmm0 \n"
1085 "pmaddubsw %%xmm4,%%xmm1 \n"
1086 "pmaddubsw %%xmm4,%%xmm2 \n"
1087 "pmaddubsw %%xmm4,%%xmm3 \n"
1088 "lea 0x40(%0),%0 \n"
1089 "phaddw %%xmm1,%%xmm0 \n"
1090 "phaddw %%xmm3,%%xmm2 \n"
1091 "psrlw $0x7,%%xmm0 \n"
1092 "psrlw $0x7,%%xmm2 \n"
1093 "packuswb %%xmm2,%%xmm0 \n"
1094 "paddb %%xmm5,%%xmm0 \n"
1095 "sub $0x10,%2 \n"
1096 "movdqa %%xmm0,(%1) \n"
1097 "lea 0x10(%1),%1 \n"
1098 "jg 1b \n"
1099 : "+r"(src_abgr), // %0
1100 "+r"(dst_y), // %1
1101 "+r"(pix) // %2
1102 : "m"(kABGRToY), // %3
1103 "m"(kAddY16) // %4
1104 : "memory", "cc"
1105 #if defined(__SSE2__)
1106 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1107 #endif
1108 );
1109 }
1110
1111 void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
1112 asm volatile (
1113 "movdqa %4,%%xmm5 \n"
1114 "movdqa %3,%%xmm4 \n"
1115 ".p2align 4 \n"
1116 "1: \n"
1117 "movdqu (%0),%%xmm0 \n"
1118 "movdqu 0x10(%0),%%xmm1 \n"
1119 "movdqu 0x20(%0),%%xmm2 \n"
1120 "movdqu 0x30(%0),%%xmm3 \n"
1121 "pmaddubsw %%xmm4,%%xmm0 \n"
1122 "pmaddubsw %%xmm4,%%xmm1 \n"
1123 "pmaddubsw %%xmm4,%%xmm2 \n"
1124 "pmaddubsw %%xmm4,%%xmm3 \n"
1125 "lea 0x40(%0),%0 \n"
1126 "phaddw %%xmm1,%%xmm0 \n"
1127 "phaddw %%xmm3,%%xmm2 \n"
1128 "psrlw $0x7,%%xmm0 \n"
1129 "psrlw $0x7,%%xmm2 \n"
1130 "packuswb %%xmm2,%%xmm0 \n"
1131 "paddb %%xmm5,%%xmm0 \n"
1132 "sub $0x10,%2 \n"
1133 "movdqu %%xmm0,(%1) \n"
1134 "lea 0x10(%1),%1 \n"
1135 "jg 1b \n"
1136 : "+r"(src_abgr), // %0
1137 "+r"(dst_y), // %1
1138 "+r"(pix) // %2
1139 : "m"(kABGRToY), // %3
1140 "m"(kAddY16) // %4
1141 : "memory", "cc"
1142 #if defined(__SSE2__)
1143 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1144 #endif
1145 );
1146 }
1147
1148 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1149 uint8* dst_u, uint8* dst_v, int width) {
1150 asm volatile (
1151 "movdqa %0,%%xmm4 \n"
1152 "movdqa %1,%%xmm3 \n"
1153 "movdqa %2,%%xmm5 \n"
1154 :
1155 : "m"(kABGRToU), // %0
1156 "m"(kABGRToV), // %1
1157 "m"(kAddUV128) // %2
1158 );
1159 asm volatile (
1160 "sub %1,%2 \n"
1161 ".p2align 4 \n"
1162 "1: \n"
1163 "movdqa (%0),%%xmm0 \n"
1164 "movdqa 0x10(%0),%%xmm1 \n"
1165 "movdqa 0x20(%0),%%xmm2 \n"
1166 "movdqa 0x30(%0),%%xmm6 \n"
1167 "pavgb (%0,%4,1),%%xmm0 \n"
1168 "pavgb 0x10(%0,%4,1),%%xmm1 \n"
1169 "pavgb 0x20(%0,%4,1),%%xmm2 \n"
1170 "pavgb 0x30(%0,%4,1),%%xmm6 \n"
1171 "lea 0x40(%0),%0 \n"
1172 "movdqa %%xmm0,%%xmm7 \n"
1173 "shufps $0x88,%%xmm1,%%xmm0 \n"
1174 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1175 "pavgb %%xmm7,%%xmm0 \n"
1176 "movdqa %%xmm2,%%xmm7 \n"
1177 "shufps $0x88,%%xmm6,%%xmm2 \n"
1178 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1179 "pavgb %%xmm7,%%xmm2 \n"
1180 "movdqa %%xmm0,%%xmm1 \n"
1181 "movdqa %%xmm2,%%xmm6 \n"
1182 "pmaddubsw %%xmm4,%%xmm0 \n"
1183 "pmaddubsw %%xmm4,%%xmm2 \n"
1184 "pmaddubsw %%xmm3,%%xmm1 \n"
1185 "pmaddubsw %%xmm3,%%xmm6 \n"
1186 "phaddw %%xmm2,%%xmm0 \n"
1187 "phaddw %%xmm6,%%xmm1 \n"
1188 "psraw $0x8,%%xmm0 \n"
1189 "psraw $0x8,%%xmm1 \n"
1190 "packsswb %%xmm1,%%xmm0 \n"
1191 "paddb %%xmm5,%%xmm0 \n"
1192 "sub $0x10,%3 \n"
1193 "movlps %%xmm0,(%1) \n"
1194 "movhps %%xmm0,(%1,%2,1) \n"
1195 "lea 0x8(%1),%1 \n"
1196 "jg 1b \n"
1197 : "+r"(src_abgr0), // %0
1198 "+r"(dst_u), // %1
1199 "+r"(dst_v), // %2
1200 "+rm"(width) // %3
1201 : "r"(static_cast<intptr_t>(src_stride_abgr))
1202 : "memory", "cc"
1203 #if defined(__SSE2__)
1204 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1205 #endif
1206 );
1207 }
1208
1209 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1210 uint8* dst_u, uint8* dst_v, int width) {
1211 asm volatile (
1212 "movdqa %0,%%xmm4 \n"
1213 "movdqa %1,%%xmm3 \n"
1214 "movdqa %2,%%xmm5 \n"
1215 :
1216 : "m"(kABGRToU), // %0
1217 "m"(kABGRToV), // %1
1218 "m"(kAddUV128) // %2
1219 );
1220 asm volatile (
1221 "sub %1,%2 \n"
1222 ".p2align 4 \n"
1223 "1: \n"
1224 "movdqu (%0),%%xmm0 \n"
1225 "movdqu 0x10(%0),%%xmm1 \n"
1226 "movdqu 0x20(%0),%%xmm2 \n"
1227 "movdqu 0x30(%0),%%xmm6 \n"
1228 "movdqu (%0,%4,1),%%xmm7 \n"
1229 "pavgb %%xmm7,%%xmm0 \n"
1230 "movdqu 0x10(%0,%4,1),%%xmm7 \n"
1231 "pavgb %%xmm7,%%xmm1 \n"
1232 "movdqu 0x20(%0,%4,1),%%xmm7 \n"
1233 "pavgb %%xmm7,%%xmm2 \n"
1234 "movdqu 0x30(%0,%4,1),%%xmm7 \n"
1235 "pavgb %%xmm7,%%xmm6 \n"
1236 "lea 0x40(%0),%0 \n"
1237 "movdqa %%xmm0,%%xmm7 \n"
1238 "shufps $0x88,%%xmm1,%%xmm0 \n"
1239 "shufps $0xdd,%%xmm1,%%xmm7 \n"
1240 "pavgb %%xmm7,%%xmm0 \n"
1241 "movdqa %%xmm2,%%xmm7 \n"
1242 "shufps $0x88,%%xmm6,%%xmm2 \n"
1243 "shufps $0xdd,%%xmm6,%%xmm7 \n"
1244 "pavgb %%xmm7,%%xmm2 \n"
1245 "movdqa %%xmm0,%%xmm1 \n"
1246 "movdqa %%xmm2,%%xmm6 \n"
1247 "pmaddubsw %%xmm4,%%xmm0 \n"
1248 "pmaddubsw %%xmm4,%%xmm2 \n"
1249 "pmaddubsw %%xmm3,%%xmm1 \n"
1250 "pmaddubsw %%xmm3,%%xmm6 \n"
1251 "phaddw %%xmm2,%%xmm0 \n"
1252 "phaddw %%xmm6,%%xmm1 \n"
1253 "psraw $0x8,%%xmm0 \n"
1254 "psraw $0x8,%%xmm1 \n"
1255 "packsswb %%xmm1,%%xmm0 \n"
1256 "paddb %%xmm5,%%xmm0 \n"
1257 "sub $0x10,%3 \n"
1258 "movlps %%xmm0,(%1) \n"
1259 "movhps %%xmm0,(%1,%2,1) \n"
1260 "lea 0x8(%1),%1 \n"
1261 "jg 1b \n"
1262 : "+r"(src_abgr0), // %0
1263 "+r"(dst_u), // %1
1264 "+r"(dst_v), // %2
1265 "+rm"(width) // %3
1266 : "r"(static_cast<intptr_t>(src_stride_abgr))
1267 : "memory", "cc"
1268 #if defined(__SSE2__)
1269 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1270 #endif
1271 );
1272 }
1273 #endif // HAS_ARGBTOYROW_SSSE3
1274
1275 #ifdef HAS_I422TOARGBROW_SSSE3
1276 #define UB 127 /* 2.018 * 64 = 129 is clamped to the int8 maximum of 127 */
1277 #define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
1278 #define UR 0
1279
1280 #define VB 0
1281 #define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
1282 #define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
1283
1284 // Bias
1285 #define BB (UB * 128 + VB * 128)
1286 #define BG (UG * 128 + VG * 128)
1287 #define BR (UR * 128 + VR * 128)
1288
1289 #define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
1290
1291 struct {
1292 vec8 kUVToB; // 0
1293 vec8 kUVToG; // 16
1294 vec8 kUVToR; // 32
1295 vec16 kUVBiasB; // 48
1296 vec16 kUVBiasG; // 64
1297 vec16 kUVBiasR; // 80
1298 vec16 kYSub16; // 96
1299 vec16 kYToRgb; // 112
1300 vec8 kVUToB; // 128
1301 vec8 kVUToG; // 144
1302 vec8 kVUToR; // 160
1303 } CONST SIMD_ALIGNED(kYuvConstants) = {
1304 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1305 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1306 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1307 { BB, BB, BB, BB, BB, BB, BB, BB },
1308 { BG, BG, BG, BG, BG, BG, BG, BG },
1309 { BR, BR, BR, BR, BR, BR, BR, BR },
1310 { 16, 16, 16, 16, 16, 16, 16, 16 },
1311 { YG, YG, YG, YG, YG, YG, YG, YG },
1312 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1313 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1314 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
1315 };
1316
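// Illustrative scalar equivalent of the YUVTORGB macro below (a hedged
// sketch; clamp8 is a hypothetical helper and this is not part of the build).
// With the constants above, each channel is computed in 6-bit fixed point:
//   int y1 = (y - 16) * YG;
//   uint8 b = clamp8((y1 + UB * (u - 128) + VB * (v - 128)) >> 6);
//   uint8 g = clamp8((y1 + UG * (u - 128) + VG * (v - 128)) >> 6);
//   uint8 r = clamp8((y1 + UR * (u - 128) + VR * (v - 128)) >> 6);
// where clamp8 saturates to [0, 255]. The kUVBias* rows fold the -128 offsets
// into a single psubw, and kYSub16/kYToRgb implement the (y - 16) * YG term.
// YVUTORGB is identical except it uses the VU-ordered tables for NV21.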
1317
1318 // Read 8 UV from 444
1319 #define READYUV444 \
1320 "movq (%[u_buf]),%%xmm0 \n" \
1321 "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
1322 "lea 0x8(%[u_buf]),%[u_buf] \n" \
1323 "punpcklbw %%xmm1,%%xmm0 \n" \
1324
1325 // Read 4 UV from 422, upsample to 8 UV
1326 #define READYUV422 \
1327 "movd (%[u_buf]),%%xmm0 \n" \
1328 "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
1329 "lea 0x4(%[u_buf]),%[u_buf] \n" \
1330 "punpcklbw %%xmm1,%%xmm0 \n" \
1331 "punpcklwd %%xmm0,%%xmm0 \n" \
1332
1333 // Read 2 UV from 411, upsample to 8 UV
1334 #define READYUV411 \
1335 "movd (%[u_buf]),%%xmm0 \n" \
1336 "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
1337 "lea 0x2(%[u_buf]),%[u_buf] \n" \
1338 "punpcklbw %%xmm1,%%xmm0 \n" \
1339 "punpcklwd %%xmm0,%%xmm0 \n" \
1340 "punpckldq %%xmm0,%%xmm0 \n" \
1341
1342 // Read 4 UV from NV12, upsample to 8 UV
1343 #define READNV12 \
1344 "movq (%[uv_buf]),%%xmm0 \n" \
1345 "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
1346 "punpcklwd %%xmm0,%%xmm0 \n" \
1347
1348 // Convert 8 pixels: 8 UV and 8 Y
1349 #define YUVTORGB \
1350 "movdqa %%xmm0,%%xmm1 \n" \
1351 "movdqa %%xmm0,%%xmm2 \n" \
1352 "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
1353 "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
1354 "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
1355 "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
1356 "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
1357 "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
1358 "movq (%[y_buf]),%%xmm3 \n" \
1359 "lea 0x8(%[y_buf]),%[y_buf] \n" \
1360 "punpcklbw %%xmm4,%%xmm3 \n" \
1361 "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
1362 "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
1363 "paddsw %%xmm3,%%xmm0 \n" \
1364 "paddsw %%xmm3,%%xmm1 \n" \
1365 "paddsw %%xmm3,%%xmm2 \n" \
1366 "psraw $0x6,%%xmm0 \n" \
1367 "psraw $0x6,%%xmm1 \n" \
1368 "psraw $0x6,%%xmm2 \n" \
1369 "packuswb %%xmm0,%%xmm0 \n" \
1370 "packuswb %%xmm1,%%xmm1 \n" \
1371 "packuswb %%xmm2,%%xmm2 \n" \
1372
1373 // Convert 8 pixels: 8 VU and 8 Y
1374 #define YVUTORGB \
1375 "movdqa %%xmm0,%%xmm1 \n" \
1376 "movdqa %%xmm0,%%xmm2 \n" \
1377 "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
1378 "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
1379 "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
1380 "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
1381 "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
1382 "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
1383 "movq (%[y_buf]),%%xmm3 \n" \
1384 "lea 0x8(%[y_buf]),%[y_buf] \n" \
1385 "punpcklbw %%xmm4,%%xmm3 \n" \
1386 "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
1387 "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
1388 "paddsw %%xmm3,%%xmm0 \n" \
1389 "paddsw %%xmm3,%%xmm1 \n" \
1390 "paddsw %%xmm3,%%xmm2 \n" \
1391 "psraw $0x6,%%xmm0 \n" \
1392 "psraw $0x6,%%xmm1 \n" \
1393 "psraw $0x6,%%xmm2 \n" \
1394 "packuswb %%xmm0,%%xmm0 \n" \
1395 "packuswb %%xmm1,%%xmm1 \n" \
1396 "packuswb %%xmm2,%%xmm2 \n" \
1397
1398 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1399 const uint8* u_buf,
1400 const uint8* v_buf,
1401 uint8* argb_buf,
1402 int width) {
1403 asm volatile (
1404 "sub %[u_buf],%[v_buf] \n"
1405 "pcmpeqb %%xmm5,%%xmm5 \n"
1406 "pxor %%xmm4,%%xmm4 \n"
1407 ".p2align 4 \n"
1408 "1: \n"
1409 READYUV444
1410 YUVTORGB
1411 "punpcklbw %%xmm1,%%xmm0 \n"
1412 "punpcklbw %%xmm5,%%xmm2 \n"
1413 "movdqa %%xmm0,%%xmm1 \n"
1414 "punpcklwd %%xmm2,%%xmm0 \n"
1415 "punpckhwd %%xmm2,%%xmm1 \n"
1416 "movdqa %%xmm0,(%[argb_buf]) \n"
1417 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1418 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1419 "sub $0x8,%[width] \n"
1420 "jg 1b \n"
1421 : [y_buf]"+r"(y_buf), // %[y_buf]
1422 [u_buf]"+r"(u_buf), // %[u_buf]
1423 [v_buf]"+r"(v_buf), // %[v_buf]
1424 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1425 [width]"+rm"(width) // %[width]
1426 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1427 : "memory", "cc"
1428 #if defined(__SSE2__)
1429 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1430 #endif
1431 );
1432 }
1433
1434 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
1435 const uint8* u_buf,
1436 const uint8* v_buf,
1437 uint8* argb_buf,
1438 int width) {
1439 asm volatile (
1440 "sub %[u_buf],%[v_buf] \n"
1441 "pcmpeqb %%xmm5,%%xmm5 \n"
1442 "pxor %%xmm4,%%xmm4 \n"
1443 ".p2align 4 \n"
1444 "1: \n"
1445 READYUV422
1446 YUVTORGB
1447 "punpcklbw %%xmm1,%%xmm0 \n"
1448 "punpcklbw %%xmm5,%%xmm2 \n"
1449 "movdqa %%xmm0,%%xmm1 \n"
1450 "punpcklwd %%xmm2,%%xmm0 \n"
1451 "punpckhwd %%xmm2,%%xmm1 \n"
1452 "movdqa %%xmm0,(%[argb_buf]) \n"
1453 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1454 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1455 "sub $0x8,%[width] \n"
1456 "jg 1b \n"
1457 : [y_buf]"+r"(y_buf), // %[y_buf]
1458 [u_buf]"+r"(u_buf), // %[u_buf]
1459 [v_buf]"+r"(v_buf), // %[v_buf]
1460 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1461 [width]"+rm"(width) // %[width]
1462 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1463 : "memory", "cc"
1464 #if defined(__SSE2__)
1465 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1466 #endif
1467 );
1468 }
1469
1470 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1471 const uint8* u_buf,
1472 const uint8* v_buf,
1473 uint8* argb_buf,
1474 int width) {
1475 asm volatile (
1476 "sub %[u_buf],%[v_buf] \n"
1477 "pcmpeqb %%xmm5,%%xmm5 \n"
1478 "pxor %%xmm4,%%xmm4 \n"
1479 ".p2align 4 \n"
1480 "1: \n"
1481 READYUV411
1482 YUVTORGB
1483 "punpcklbw %%xmm1,%%xmm0 \n"
1484 "punpcklbw %%xmm5,%%xmm2 \n"
1485 "movdqa %%xmm0,%%xmm1 \n"
1486 "punpcklwd %%xmm2,%%xmm0 \n"
1487 "punpckhwd %%xmm2,%%xmm1 \n"
1488 "movdqa %%xmm0,(%[argb_buf]) \n"
1489 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1490 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1491 "sub $0x8,%[width] \n"
1492 "jg 1b \n"
1493 : [y_buf]"+r"(y_buf), // %[y_buf]
1494 [u_buf]"+r"(u_buf), // %[u_buf]
1495 [v_buf]"+r"(v_buf), // %[v_buf]
1496 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1497 [width]"+rm"(width) // %[width]
1498 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1499 : "memory", "cc"
1500 #if defined(__SSE2__)
1501 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1502 #endif
1503 );
1504 }
1505
1506 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1507 const uint8* uv_buf,
1508 uint8* argb_buf,
1509 int width) {
1510 asm volatile (
1511 "pcmpeqb %%xmm5,%%xmm5 \n"
1512 "pxor %%xmm4,%%xmm4 \n"
1513 ".p2align 4 \n"
1514 "1: \n"
1515 READNV12
1516 YUVTORGB
1517 "punpcklbw %%xmm1,%%xmm0 \n"
1518 "punpcklbw %%xmm5,%%xmm2 \n"
1519 "movdqa %%xmm0,%%xmm1 \n"
1520 "punpcklwd %%xmm2,%%xmm0 \n"
1521 "punpckhwd %%xmm2,%%xmm1 \n"
1522 "movdqa %%xmm0,(%[argb_buf]) \n"
1523 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1524 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1525 "sub $0x8,%[width] \n"
1526 "jg 1b \n"
1527 : [y_buf]"+r"(y_buf), // %[y_buf]
1528 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1529 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1530 [width]"+rm"(width) // %[width]
1531 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1532 : "memory", "cc"
1533 #if defined(__SSE2__)
1534 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1535 #endif
1536 );
1537 }
1538
1539 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1540 const uint8* vu_buf,
1541 uint8* argb_buf,
1542 int width) {
1543 asm volatile (
1544 "pcmpeqb %%xmm5,%%xmm5 \n"
1545 "pxor %%xmm4,%%xmm4 \n"
1546 ".p2align 4 \n"
1547 "1: \n"
1548 READNV12
1549 YVUTORGB
1550 "punpcklbw %%xmm1,%%xmm0 \n"
1551 "punpcklbw %%xmm5,%%xmm2 \n"
1552 "movdqa %%xmm0,%%xmm1 \n"
1553 "punpcklwd %%xmm2,%%xmm0 \n"
1554 "punpckhwd %%xmm2,%%xmm1 \n"
1555 "movdqa %%xmm0,(%[argb_buf]) \n"
1556 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1557 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1558 "sub $0x8,%[width] \n"
1559 "jg 1b \n"
1560 : [y_buf]"+r"(y_buf), // %[y_buf]
1561 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1562 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1563 [width]"+rm"(width) // %[width]
1564 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1565 : "memory", "cc"
1566 #if defined(__SSE2__)
1567 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1568 #endif
1569 );
1570 }
1571
1572 void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1573 const uint8* u_buf,
1574 const uint8* v_buf,
1575 uint8* argb_buf,
1576 int width) {
1577 asm volatile (
1578 "sub %[u_buf],%[v_buf] \n"
1579 "pcmpeqb %%xmm5,%%xmm5 \n"
1580 "pxor %%xmm4,%%xmm4 \n"
1581 ".p2align 4 \n"
1582 "1: \n"
1583 READYUV444
1584 YUVTORGB
1585 "punpcklbw %%xmm1,%%xmm0 \n"
1586 "punpcklbw %%xmm5,%%xmm2 \n"
1587 "movdqa %%xmm0,%%xmm1 \n"
1588 "punpcklwd %%xmm2,%%xmm0 \n"
1589 "punpckhwd %%xmm2,%%xmm1 \n"
1590 "movdqu %%xmm0,(%[argb_buf]) \n"
1591 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1592 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1593 "sub $0x8,%[width] \n"
1594 "jg 1b \n"
1595 : [y_buf]"+r"(y_buf), // %[y_buf]
1596 [u_buf]"+r"(u_buf), // %[u_buf]
1597 [v_buf]"+r"(v_buf), // %[v_buf]
1598 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1599 [width]"+rm"(width) // %[width]
1600 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1601 : "memory", "cc"
1602 #if defined(__SSE2__)
1603 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1604 #endif
1605 );
1606 }
1607
1608 void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1609 const uint8* u_buf,
1610 const uint8* v_buf,
1611 uint8* argb_buf,
1612 int width) {
1613 asm volatile (
1614 "sub %[u_buf],%[v_buf] \n"
1615 "pcmpeqb %%xmm5,%%xmm5 \n"
1616 "pxor %%xmm4,%%xmm4 \n"
1617 ".p2align 4 \n"
1618 "1: \n"
1619 READYUV422
1620 YUVTORGB
1621 "punpcklbw %%xmm1,%%xmm0 \n"
1622 "punpcklbw %%xmm5,%%xmm2 \n"
1623 "movdqa %%xmm0,%%xmm1 \n"
1624 "punpcklwd %%xmm2,%%xmm0 \n"
1625 "punpckhwd %%xmm2,%%xmm1 \n"
1626 "movdqu %%xmm0,(%[argb_buf]) \n"
1627 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1628 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1629 "sub $0x8,%[width] \n"
1630 "jg 1b \n"
1631 : [y_buf]"+r"(y_buf), // %[y_buf]
1632 [u_buf]"+r"(u_buf), // %[u_buf]
1633 [v_buf]"+r"(v_buf), // %[v_buf]
1634 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1635 [width]"+rm"(width) // %[width]
1636 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1637 : "memory", "cc"
1638 #if defined(__SSE2__)
1639 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1640 #endif
1641 );
1642 }
1643
1644 void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1645 const uint8* u_buf,
1646 const uint8* v_buf,
1647 uint8* argb_buf,
1648 int width) {
1649 asm volatile (
1650 "sub %[u_buf],%[v_buf] \n"
1651 "pcmpeqb %%xmm5,%%xmm5 \n"
1652 "pxor %%xmm4,%%xmm4 \n"
1653 ".p2align 4 \n"
1654 "1: \n"
1655 READYUV411
1656 YUVTORGB
1657 "punpcklbw %%xmm1,%%xmm0 \n"
1658 "punpcklbw %%xmm5,%%xmm2 \n"
1659 "movdqa %%xmm0,%%xmm1 \n"
1660 "punpcklwd %%xmm2,%%xmm0 \n"
1661 "punpckhwd %%xmm2,%%xmm1 \n"
1662 "movdqu %%xmm0,(%[argb_buf]) \n"
1663 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1664 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1665 "sub $0x8,%[width] \n"
1666 "jg 1b \n"
1667 : [y_buf]"+r"(y_buf), // %[y_buf]
1668 [u_buf]"+r"(u_buf), // %[u_buf]
1669 [v_buf]"+r"(v_buf), // %[v_buf]
1670 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1671 [width]"+rm"(width) // %[width]
1672 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1673 : "memory", "cc"
1674 #if defined(__SSE2__)
1675 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1676 #endif
1677 );
1678 }
1679
1680 void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1681 const uint8* uv_buf,
1682 uint8* argb_buf,
1683 int width) {
1684 asm volatile (
1685 "pcmpeqb %%xmm5,%%xmm5 \n"
1686 "pxor %%xmm4,%%xmm4 \n"
1687 ".p2align 4 \n"
1688 "1: \n"
1689 READNV12
1690 YUVTORGB
1691 "punpcklbw %%xmm1,%%xmm0 \n"
1692 "punpcklbw %%xmm5,%%xmm2 \n"
1693 "movdqa %%xmm0,%%xmm1 \n"
1694 "punpcklwd %%xmm2,%%xmm0 \n"
1695 "punpckhwd %%xmm2,%%xmm1 \n"
1696 "movdqu %%xmm0,(%[argb_buf]) \n"
1697 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1698 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1699 "sub $0x8,%[width] \n"
1700 "jg 1b \n"
1701 : [y_buf]"+r"(y_buf), // %[y_buf]
1702 [uv_buf]"+r"(uv_buf), // %[uv_buf]
1703 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1704 [width]"+rm"(width) // %[width]
1705 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1706 : "memory", "cc"
1707 #if defined(__SSE2__)
1708 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1709 #endif
1710 );
1711 }
1712
1713 void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1714 const uint8* vu_buf,
1715 uint8* argb_buf,
1716 int width) {
1717 asm volatile (
1718 "pcmpeqb %%xmm5,%%xmm5 \n"
1719 "pxor %%xmm4,%%xmm4 \n"
1720 ".p2align 4 \n"
1721 "1: \n"
1722 READNV12
1723 YVUTORGB
1724 "punpcklbw %%xmm1,%%xmm0 \n"
1725 "punpcklbw %%xmm5,%%xmm2 \n"
1726 "movdqa %%xmm0,%%xmm1 \n"
1727 "punpcklwd %%xmm2,%%xmm0 \n"
1728 "punpckhwd %%xmm2,%%xmm1 \n"
1729 "movdqu %%xmm0,(%[argb_buf]) \n"
1730 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1731 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1732 "sub $0x8,%[width] \n"
1733 "jg 1b \n"
1734 : [y_buf]"+r"(y_buf), // %[y_buf]
1735 [uv_buf]"+r"(vu_buf), // %[uv_buf]
1736 [argb_buf]"+r"(argb_buf), // %[argb_buf]
1737 [width]"+rm"(width) // %[width]
1738 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1739 : "memory", "cc"
1740 #if defined(__SSE2__)
1741 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1742 #endif
1743 );
1744 }
1745
1746 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1747 const uint8* u_buf,
1748 const uint8* v_buf,
1749 uint8* bgra_buf,
1750 int width) {
1751 asm volatile (
1752 "sub %[u_buf],%[v_buf] \n"
1753 "pcmpeqb %%xmm5,%%xmm5 \n"
1754 "pxor %%xmm4,%%xmm4 \n"
1755 ".p2align 4 \n"
1756 "1: \n"
1757 READYUV422
1758 YUVTORGB
1759 "pcmpeqb %%xmm5,%%xmm5 \n"
1760 "punpcklbw %%xmm0,%%xmm1 \n"
1761 "punpcklbw %%xmm2,%%xmm5 \n"
1762 "movdqa %%xmm5,%%xmm0 \n"
1763 "punpcklwd %%xmm1,%%xmm5 \n"
1764 "punpckhwd %%xmm1,%%xmm0 \n"
1765 "movdqa %%xmm5,(%[argb_buf]) \n"
1766 "movdqa %%xmm0,0x10(%[argb_buf]) \n"
1767 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1768 "sub $0x8,%[width] \n"
1769 "jg 1b \n"
1770 : [y_buf]"+r"(y_buf), // %[y_buf]
1771 [u_buf]"+r"(u_buf), // %[u_buf]
1772 [v_buf]"+r"(v_buf), // %[v_buf]
1773 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
1774 [width]"+rm"(width) // %[width]
1775 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1776 : "memory", "cc"
1777 #if defined(__SSE2__)
1778 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1779 #endif
1780 );
1781 }
1782
1783 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
1784 const uint8* u_buf,
1785 const uint8* v_buf,
1786 uint8* abgr_buf,
1787 int width) {
1788 asm volatile (
1789 "sub %[u_buf],%[v_buf] \n"
1790 "pcmpeqb %%xmm5,%%xmm5 \n"
1791 "pxor %%xmm4,%%xmm4 \n"
1792 ".p2align 4 \n"
1793 "1: \n"
1794 READYUV422
1795 YUVTORGB
1796 "punpcklbw %%xmm1,%%xmm2 \n"
1797 "punpcklbw %%xmm5,%%xmm0 \n"
1798 "movdqa %%xmm2,%%xmm1 \n"
1799 "punpcklwd %%xmm0,%%xmm2 \n"
1800 "punpckhwd %%xmm0,%%xmm1 \n"
1801 "movdqa %%xmm2,(%[argb_buf]) \n"
1802 "movdqa %%xmm1,0x10(%[argb_buf]) \n"
1803 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1804 "sub $0x8,%[width] \n"
1805 "jg 1b \n"
1806 : [y_buf]"+r"(y_buf), // %[y_buf]
1807 [u_buf]"+r"(u_buf), // %[u_buf]
1808 [v_buf]"+r"(v_buf), // %[v_buf]
1809 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
1810 [width]"+rm"(width) // %[width]
1811 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1812 : "memory", "cc"
1813 #if defined(__SSE2__)
1814 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1815 #endif
1816 );
1817 }
1818
1819 void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
1820 const uint8* u_buf,
1821 const uint8* v_buf,
1822 uint8* bgra_buf,
1823 int width) {
1824 asm volatile (
1825 "sub %[u_buf],%[v_buf] \n"
1826 "pcmpeqb %%xmm5,%%xmm5 \n"
1827 "pxor %%xmm4,%%xmm4 \n"
1828 ".p2align 4 \n"
1829 "1: \n"
1830 READYUV422
1831 YUVTORGB
1832 "pcmpeqb %%xmm5,%%xmm5 \n"
1833 "punpcklbw %%xmm0,%%xmm1 \n"
1834 "punpcklbw %%xmm2,%%xmm5 \n"
1835 "movdqa %%xmm5,%%xmm0 \n"
1836 "punpcklwd %%xmm1,%%xmm5 \n"
1837 "punpckhwd %%xmm1,%%xmm0 \n"
1838 "movdqu %%xmm5,(%[argb_buf]) \n"
1839 "movdqu %%xmm0,0x10(%[argb_buf]) \n"
1840 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1841 "sub $0x8,%[width] \n"
1842 "jg 1b \n"
1843 : [y_buf]"+r"(y_buf), // %[y_buf]
1844 [u_buf]"+r"(u_buf), // %[u_buf]
1845 [v_buf]"+r"(v_buf), // %[v_buf]
1846 [argb_buf]"+r"(bgra_buf), // %[argb_buf]
1847 [width]"+rm"(width) // %[width]
1848 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1849 : "memory", "cc"
1850 #if defined(__SSE2__)
1851 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1852 #endif
1853 );
1854 }
1855
1856 void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
1857 const uint8* u_buf,
1858 const uint8* v_buf,
1859 uint8* abgr_buf,
1860 int width) {
1861 asm volatile (
1862 "sub %[u_buf],%[v_buf] \n"
1863 "pcmpeqb %%xmm5,%%xmm5 \n"
1864 "pxor %%xmm4,%%xmm4 \n"
1865 ".p2align 4 \n"
1866 "1: \n"
1867 READYUV422
1868 YUVTORGB
1869 "punpcklbw %%xmm1,%%xmm2 \n"
1870 "punpcklbw %%xmm5,%%xmm0 \n"
1871 "movdqa %%xmm2,%%xmm1 \n"
1872 "punpcklwd %%xmm0,%%xmm2 \n"
1873 "punpckhwd %%xmm0,%%xmm1 \n"
1874 "movdqu %%xmm2,(%[argb_buf]) \n"
1875 "movdqu %%xmm1,0x10(%[argb_buf]) \n"
1876 "lea 0x20(%[argb_buf]),%[argb_buf] \n"
1877 "sub $0x8,%[width] \n"
1878 "jg 1b \n"
1879 : [y_buf]"+r"(y_buf), // %[y_buf]
1880 [u_buf]"+r"(u_buf), // %[u_buf]
1881 [v_buf]"+r"(v_buf), // %[v_buf]
1882 [argb_buf]"+r"(abgr_buf), // %[argb_buf]
1883 [width]"+rm"(width) // %[width]
1884 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1885 : "memory", "cc"
1886 #if defined(__SSE2__)
1887 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1888 #endif
1889 );
1890 }
1891 #endif // HAS_I422TOARGBROW_SSSE3
1892
1893 #ifdef HAS_YTOARGBROW_SSE2
1894 void YToARGBRow_SSE2(const uint8* y_buf,
1895 uint8* rgb_buf,
1896 int width) {
1897 asm volatile (
1898 "pcmpeqb %%xmm4,%%xmm4 \n"
1899 "pslld $0x18,%%xmm4 \n"
1900 "mov $0x10001000,%%eax \n"
1901 "movd %%eax,%%xmm3 \n"
1902 "pshufd $0x0,%%xmm3,%%xmm3 \n"
1903 "mov $0x012a012a,%%eax \n"
1904 "movd %%eax,%%xmm2 \n"
1905 "pshufd $0x0,%%xmm2,%%xmm2 \n"
1906 ".p2align 4 \n"
1907 "1: \n"
1908 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
1909 "movq (%0),%%xmm0 \n"
1910 "lea 0x8(%0),%0 \n"
1911 "punpcklbw %%xmm0,%%xmm0 \n"
1912 "psubusw %%xmm3,%%xmm0 \n"
1913 "pmulhuw %%xmm2,%%xmm0 \n"
1914 "packuswb %%xmm0,%%xmm0 \n"
1915
1916 // Step 2: Weave into ARGB
1917 "punpcklbw %%xmm0,%%xmm0 \n"
1918 "movdqa %%xmm0,%%xmm1 \n"
1919 "punpcklwd %%xmm0,%%xmm0 \n"
1920 "punpckhwd %%xmm1,%%xmm1 \n"
1921 "por %%xmm4,%%xmm0 \n"
1922 "por %%xmm4,%%xmm1 \n"
1923 "movdqa %%xmm0,(%1) \n"
1924 "movdqa %%xmm1,16(%1) \n"
1925 "lea 32(%1),%1 \n"
1926
1927 "sub $0x8,%2 \n"
1928 "jg 1b \n"
1929 : "+r"(y_buf), // %0
1930 "+r"(rgb_buf), // %1
1931 "+rm"(width) // %2
1932 :
1933 : "memory", "cc", "eax"
1934 #if defined(__SSE2__)
1935 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1936 #endif
1937 );
1938 }
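// A scalar sketch of the two steps above (illustrative only; the fixed-point
// constants 0x1010 and 0x012a mirror the psubusw/pmulhuw pair, roughly
// (y - 16) * 1.164). Hypothetical name; not part of the build.
#if 0
static void YToARGBRowSketch_C(const uint8* y_buf, uint8* rgb_buf, int width) {
  for (int x = 0; x < width; ++x) {
    int y = y_buf[x] * 0x0101;        // Duplicate the byte into a 16-bit word.
    y = y > 0x1010 ? y - 0x1010 : 0;  // Saturating subtract of 16 * 257.
    uint32 v = (static_cast<uint32>(y) * 0x012a) >> 16;  // * 1.164, back to 8 bit.
    if (v > 255) v = 255;
    rgb_buf[0] = rgb_buf[1] = rgb_buf[2] = static_cast<uint8>(v);  // B = G = R.
    rgb_buf[3] = 255;                 // Opaque alpha.
    rgb_buf += 4;
  }
}
#endif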
1939 #endif // HAS_YTOARGBROW_SSE2
1940
1941 #ifdef HAS_MIRRORROW_SSSE3
1942 // Shuffle table for reversing the bytes.
1943 CONST uvec8 kShuffleMirror = {
1944 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
1945 };
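// The table above feeds pshufb so each 16-byte load is written out in reverse
// byte order. A scalar sketch of the same row operation (illustrative only;
// hypothetical name, not compiled in):
#if 0
static void MirrorRowSketch_C(const uint8* src, uint8* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];  // Last byte first.
  }
}
#endif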
1946
1947 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
1948 intptr_t temp_width = static_cast<intptr_t>(width);
1949 asm volatile (
1950 "movdqa %3,%%xmm5 \n"
1951 "lea -0x10(%0),%0 \n"
1952 ".p2align 4 \n"
1953 "1: \n"
1954 "movdqa (%0,%2),%%xmm0 \n"
1955 "pshufb %%xmm5,%%xmm0 \n"
1956 "sub $0x10,%2 \n"
1957 "movdqa %%xmm0,(%1) \n"
1958 "lea 0x10(%1),%1 \n"
1959 "jg 1b \n"
1960 : "+r"(src), // %0
1961 "+r"(dst), // %1
1962 "+r"(temp_width) // %2
1963 : "m"(kShuffleMirror) // %3
1964 : "memory", "cc"
1965 #if defined(__SSE2__)
1966 , "xmm0", "xmm5"
1967 #endif
1968 );
1969 }
1970 #endif // HAS_MIRRORROW_SSSE3
1971
1972 #ifdef HAS_MIRRORROW_SSE2
1973 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
1974 intptr_t temp_width = static_cast<intptr_t>(width);
1975 asm volatile (
1976 "lea -0x10(%0),%0 \n"
1977 ".p2align 4 \n"
1978 "1: \n"
1979 "movdqu (%0,%2),%%xmm0 \n"
1980 "movdqa %%xmm0,%%xmm1 \n"
1981 "psllw $0x8,%%xmm0 \n"
1982 "psrlw $0x8,%%xmm1 \n"
1983 "por %%xmm1,%%xmm0 \n"
1984 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
1985 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
1986 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
1987 "sub $0x10,%2 \n"
1988 "movdqu %%xmm0,(%1) \n"
1989 "lea 0x10(%1),%1 \n"
1990 "jg 1b \n"
1991 : "+r"(src), // %0
1992 "+r"(dst), // %1
1993 "+r"(temp_width) // %2
1994 :
1995 : "memory", "cc"
1996 #if defined(__SSE2__)
1997 , "xmm0", "xmm1"
1998 #endif
1999 );
2000 }
2001 #endif // HAS_MIRRORROW_SSE2
2002
2003 #ifdef HAS_MIRRORROW_UV_SSSE3
2004 // Shuffle table for reversing the bytes of UV channels.
2005 CONST uvec8 kShuffleMirrorUV = {
2006 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2007 };
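// The table above reverses 8 interleaved UV pairs and gathers the U bytes in
// the low half and the V bytes in the high half. Scalar sketch (illustrative
// only; hypothetical name, not compiled in):
#if 0
static void MirrorRowUVSketch_C(const uint8* src, uint8* dst_u, uint8* dst_v,
                                int width) {
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src[(width - 1 - x) * 2 + 0];  // U from the mirrored pair.
    dst_v[x] = src[(width - 1 - x) * 2 + 1];  // V from the mirrored pair.
  }
}
#endif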
2008 void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2009 int width) {
2010 intptr_t temp_width = static_cast<intptr_t>(width);
2011 asm volatile (
2012 "movdqa %4,%%xmm1 \n"
2013 "lea -16(%0,%3,2),%0 \n"
2014 "sub %1,%2 \n"
2015 ".p2align 4 \n"
2016 "1: \n"
2017 "movdqa (%0),%%xmm0 \n"
2018 "lea -16(%0),%0 \n"
2019 "pshufb %%xmm1,%%xmm0 \n"
2020 "sub $8,%3 \n"
2021 "movlpd %%xmm0,(%1) \n"
2022 "movhpd %%xmm0,(%1,%2) \n"
2023 "lea 8(%1),%1 \n"
2024 "jg 1b \n"
2025 : "+r"(src), // %0
2026 "+r"(dst_u), // %1
2027 "+r"(dst_v), // %2
2028 "+r"(temp_width) // %3
2029 : "m"(kShuffleMirrorUV) // %4
2030 : "memory", "cc"
2031 #if defined(__SSE2__)
2032 , "xmm0", "xmm1"
2033 #endif
2034 );
2035 }
2036 #endif // HAS_MIRRORROW_UV_SSSE3
2037
2038 #ifdef HAS_ARGBMIRRORROW_SSSE3
2039 // Shuffle table for reversing the bytes.
2040 CONST uvec8 kARGBShuffleMirror = {
2041 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2042 };
2043
2044 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2045 intptr_t temp_width = static_cast<intptr_t>(width);
2046 asm volatile (
2047 "movdqa %3,%%xmm5 \n"
2048 "lea -0x10(%0),%0 \n"
2049 ".p2align 4 \n"
2050 "1: \n"
2051 "movdqa (%0,%2,4),%%xmm0 \n"
2052 "pshufb %%xmm5,%%xmm0 \n"
2053 "sub $0x4,%2 \n"
2054 "movdqa %%xmm0,(%1) \n"
2055 "lea 0x10(%1),%1 \n"
2056 "jg 1b \n"
2057 : "+r"(src), // %0
2058 "+r"(dst), // %1
2059 "+r"(temp_width) // %2
2060 : "m"(kARGBShuffleMirror) // %3
2061 : "memory", "cc"
2062 #if defined(__SSE2__)
2063 , "xmm0", "xmm5"
2064 #endif
2065 );
2066 }
2067 #endif // HAS_ARGBMIRRORROW_SSSE3
2068
2069 #ifdef HAS_SPLITUV_SSE2
2070 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2071 asm volatile (
2072 "pcmpeqb %%xmm5,%%xmm5 \n"
2073 "psrlw $0x8,%%xmm5 \n"
2074 "sub %1,%2 \n"
2075 ".p2align 4 \n"
2076 "1: \n"
2077 "movdqa (%0),%%xmm0 \n"
2078 "movdqa 0x10(%0),%%xmm1 \n"
2079 "lea 0x20(%0),%0 \n"
2080 "movdqa %%xmm0,%%xmm2 \n"
2081 "movdqa %%xmm1,%%xmm3 \n"
2082 "pand %%xmm5,%%xmm0 \n"
2083 "pand %%xmm5,%%xmm1 \n"
2084 "packuswb %%xmm1,%%xmm0 \n"
2085 "psrlw $0x8,%%xmm2 \n"
2086 "psrlw $0x8,%%xmm3 \n"
2087 "packuswb %%xmm3,%%xmm2 \n"
2088 "movdqa %%xmm0,(%1) \n"
2089 "movdqa %%xmm2,(%1,%2) \n"
2090 "lea 0x10(%1),%1 \n"
2091 "sub $0x10,%3 \n"
2092 "jg 1b \n"
2093 : "+r"(src_uv), // %0
2094 "+r"(dst_u), // %1
2095 "+r"(dst_v), // %2
2096 "+r"(pix) // %3
2097 :
2098 : "memory", "cc"
2099 #if defined(__SSE2__)
2100 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2101 #endif
2102 );
2103 }
2104 #endif // HAS_SPLITUV_SSE2
2105
2106 #ifdef HAS_COPYROW_SSE2
2107 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2108 asm volatile (
2109 "sub %0,%1 \n"
2110 ".p2align 4 \n"
2111 "1: \n"
2112 "movdqa (%0),%%xmm0 \n"
2113 "movdqa 0x10(%0),%%xmm1 \n"
2114 "movdqa %%xmm0,(%0,%1) \n"
2115 "movdqa %%xmm1,0x10(%0,%1) \n"
2116 "lea 0x20(%0),%0 \n"
2117 "sub $0x20,%2 \n"
2118 "jg 1b \n"
2119 : "+r"(src), // %0
2120 "+r"(dst), // %1
2121 "+r"(count) // %2
2122 :
2123 : "memory", "cc"
2124 #if defined(__SSE2__)
2125 , "xmm0", "xmm1"
2126 #endif
2127 );
2128 }
2129 #endif // HAS_COPYROW_SSE2
2130
2131 #ifdef HAS_COPYROW_X86
2132 void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2133 size_t width_tmp = static_cast<size_t>(width);
2134 asm volatile (
2135 "shr $0x2,%2 \n"
2136 "rep movsl \n"
2137 : "+S"(src), // %0
2138 "+D"(dst), // %1
2139 "+c"(width_tmp) // %2
2140 :
2141 : "memory", "cc"
2142 );
2143 }
2144 #endif // HAS_COPYROW_X86
2145
2146 #ifdef HAS_SETROW_X86
2147 void SetRow8_X86(uint8* dst, uint32 v32, int width) {
2148 size_t width_tmp = static_cast<size_t>(width);
2149 asm volatile (
2150 "shr $0x2,%1 \n"
2151 "rep stosl \n"
2152 : "+D"(dst), // %0
2153 "+c"(width_tmp) // %1
2154 : "a"(v32) // %2
2155 : "memory", "cc");
2156 }
2157
2158 void SetRows32_X86(uint8* dst, uint32 v32, int width,
2159 int dst_stride, int height) {
2160 for (int y = 0; y < height; ++y) {
2161 size_t width_tmp = static_cast<size_t>(width);
2162 uint32* d = reinterpret_cast<uint32*>(dst);
2163 asm volatile (
2164 "rep stosl \n"
2165 : "+D"(d), // %0
2166 "+c"(width_tmp) // %1
2167 : "a"(v32) // %2
2168 : "memory", "cc");
2169 dst += dst_stride;
2170 }
2171 }
2172 #endif // HAS_SETROW_X86
2173
2174 #ifdef HAS_YUY2TOYROW_SSE2
2175 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
2176 asm volatile (
2177 "pcmpeqb %%xmm5,%%xmm5 \n"
2178 "psrlw $0x8,%%xmm5 \n"
2179 ".p2align 4 \n"
2180 "1: \n"
2181 "movdqa (%0),%%xmm0 \n"
2182 "movdqa 0x10(%0),%%xmm1 \n"
2183 "lea 0x20(%0),%0 \n"
2184 "pand %%xmm5,%%xmm0 \n"
2185 "pand %%xmm5,%%xmm1 \n"
2186 "packuswb %%xmm1,%%xmm0 \n"
2187 "movdqa %%xmm0,(%1) \n"
2188 "lea 0x10(%1),%1 \n"
2189 "sub $0x10,%2 \n"
2190 "jg 1b \n"
2191 : "+r"(src_yuy2), // %0
2192 "+r"(dst_y), // %1
2193 "+r"(pix) // %2
2194 :
2195 : "memory", "cc"
2196 #if defined(__SSE2__)
2197 , "xmm0", "xmm1", "xmm5"
2198 #endif
2199 );
2200 }
2201
2202 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
2203 uint8* dst_u, uint8* dst_v, int pix) {
2204 asm volatile (
2205 "pcmpeqb %%xmm5,%%xmm5 \n"
2206 "psrlw $0x8,%%xmm5 \n"
2207 "sub %1,%2 \n"
2208 ".p2align 4 \n"
2209 "1: \n"
2210 "movdqa (%0),%%xmm0 \n"
2211 "movdqa 0x10(%0),%%xmm1 \n"
2212 "movdqa (%0,%4,1),%%xmm2 \n"
2213 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2214 "lea 0x20(%0),%0 \n"
2215 "pavgb %%xmm2,%%xmm0 \n"
2216 "pavgb %%xmm3,%%xmm1 \n"
2217 "psrlw $0x8,%%xmm0 \n"
2218 "psrlw $0x8,%%xmm1 \n"
2219 "packuswb %%xmm1,%%xmm0 \n"
2220 "movdqa %%xmm0,%%xmm1 \n"
2221 "pand %%xmm5,%%xmm0 \n"
2222 "packuswb %%xmm0,%%xmm0 \n"
2223 "psrlw $0x8,%%xmm1 \n"
2224 "packuswb %%xmm1,%%xmm1 \n"
2225 "movq %%xmm0,(%1) \n"
2226 "movq %%xmm1,(%1,%2) \n"
2227 "lea 0x8(%1),%1 \n"
2228 "sub $0x10,%3 \n"
2229 "jg 1b \n"
2230 : "+r"(src_yuy2), // %0
2231 "+r"(dst_u), // %1
2232 "+r"(dst_v), // %2
2233 "+r"(pix) // %3
2234 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2235 : "memory", "cc"
2236 #if defined(__SSE2__)
2237 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2238 #endif
2239 );
2240 }
2241
2242 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2243 uint8* dst_u, uint8* dst_v, int pix) {
2244 asm volatile (
2245 "pcmpeqb %%xmm5,%%xmm5 \n"
2246 "psrlw $0x8,%%xmm5 \n"
2247 "sub %1,%2 \n"
2248 ".p2align 4 \n"
2249 "1: \n"
2250 "movdqa (%0),%%xmm0 \n"
2251 "movdqa 0x10(%0),%%xmm1 \n"
2252 "lea 0x20(%0),%0 \n"
2253 "psrlw $0x8,%%xmm0 \n"
2254 "psrlw $0x8,%%xmm1 \n"
2255 "packuswb %%xmm1,%%xmm0 \n"
2256 "movdqa %%xmm0,%%xmm1 \n"
2257 "pand %%xmm5,%%xmm0 \n"
2258 "packuswb %%xmm0,%%xmm0 \n"
2259 "psrlw $0x8,%%xmm1 \n"
2260 "packuswb %%xmm1,%%xmm1 \n"
2261 "movq %%xmm0,(%1) \n"
2262 "movq %%xmm1,(%1,%2) \n"
2263 "lea 0x8(%1),%1 \n"
2264 "sub $0x10,%3 \n"
2265 "jg 1b \n"
2266 : "+r"(src_yuy2), // %0
2267 "+r"(dst_u), // %1
2268 "+r"(dst_v), // %2
2269 "+r"(pix) // %3
2270 :
2271 : "memory", "cc"
2272 #if defined(__SSE2__)
2273 , "xmm0", "xmm1", "xmm5"
2274 #endif
2275 );
2276 }
2277
2278 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2279 uint8* dst_y, int pix) {
2280 asm volatile (
2281 "pcmpeqb %%xmm5,%%xmm5 \n"
2282 "psrlw $0x8,%%xmm5 \n"
2283 ".p2align 4 \n"
2284 "1: \n"
2285 "movdqu (%0),%%xmm0 \n"
2286 "movdqu 0x10(%0),%%xmm1 \n"
2287 "lea 0x20(%0),%0 \n"
2288 "pand %%xmm5,%%xmm0 \n"
2289 "pand %%xmm5,%%xmm1 \n"
2290 "packuswb %%xmm1,%%xmm0 \n"
2291 "sub $0x10,%2 \n"
2292 "movdqu %%xmm0,(%1) \n"
2293 "lea 0x10(%1),%1 \n"
2294 "jg 1b \n"
2295 : "+r"(src_yuy2), // %0
2296 "+r"(dst_y), // %1
2297 "+r"(pix) // %2
2298 :
2299 : "memory", "cc"
2300 #if defined(__SSE2__)
2301 , "xmm0", "xmm1", "xmm5"
2302 #endif
2303 );
2304 }
2305
2306 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2307 int stride_yuy2,
2308 uint8* dst_u, uint8* dst_v, int pix) {
2309 asm volatile (
2310 "pcmpeqb %%xmm5,%%xmm5 \n"
2311 "psrlw $0x8,%%xmm5 \n"
2312 "sub %1,%2 \n"
2313 ".p2align 4 \n"
2314 "1: \n"
2315 "movdqu (%0),%%xmm0 \n"
2316 "movdqu 0x10(%0),%%xmm1 \n"
2317 "movdqu (%0,%4,1),%%xmm2 \n"
2318 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2319 "lea 0x20(%0),%0 \n"
2320 "pavgb %%xmm2,%%xmm0 \n"
2321 "pavgb %%xmm3,%%xmm1 \n"
2322 "psrlw $0x8,%%xmm0 \n"
2323 "psrlw $0x8,%%xmm1 \n"
2324 "packuswb %%xmm1,%%xmm0 \n"
2325 "movdqa %%xmm0,%%xmm1 \n"
2326 "pand %%xmm5,%%xmm0 \n"
2327 "packuswb %%xmm0,%%xmm0 \n"
2328 "psrlw $0x8,%%xmm1 \n"
2329 "packuswb %%xmm1,%%xmm1 \n"
2330 "movq %%xmm0,(%1) \n"
2331 "movq %%xmm1,(%1,%2) \n"
2332 "lea 0x8(%1),%1 \n"
2333 "sub $0x10,%3 \n"
2334 "jg 1b \n"
2335 : "+r"(src_yuy2), // %0
2336 "+r"(dst_u), // %1
2337 "+r"(dst_v), // %2
2338 "+r"(pix) // %3
2339 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
2340 : "memory", "cc"
2341 #if defined(__SSE2__)
2342 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2343 #endif
2344 );
2345 }
2346
2347 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2348 uint8* dst_u, uint8* dst_v, int pix) {
2349 asm volatile (
2350 "pcmpeqb %%xmm5,%%xmm5 \n"
2351 "psrlw $0x8,%%xmm5 \n"
2352 "sub %1,%2 \n"
2353 ".p2align 4 \n"
2354 "1: \n"
2355 "movdqu (%0),%%xmm0 \n"
2356 "movdqu 0x10(%0),%%xmm1 \n"
2357 "lea 0x20(%0),%0 \n"
2358 "psrlw $0x8,%%xmm0 \n"
2359 "psrlw $0x8,%%xmm1 \n"
2360 "packuswb %%xmm1,%%xmm0 \n"
2361 "movdqa %%xmm0,%%xmm1 \n"
2362 "pand %%xmm5,%%xmm0 \n"
2363 "packuswb %%xmm0,%%xmm0 \n"
2364 "psrlw $0x8,%%xmm1 \n"
2365 "packuswb %%xmm1,%%xmm1 \n"
2366 "movq %%xmm0,(%1) \n"
2367 "movq %%xmm1,(%1,%2) \n"
2368 "lea 0x8(%1),%1 \n"
2369 "sub $0x10,%3 \n"
2370 "jg 1b \n"
2371 : "+r"(src_yuy2), // %0
2372 "+r"(dst_u), // %1
2373 "+r"(dst_v), // %2
2374 "+r"(pix) // %3
2375 :
2376 : "memory", "cc"
2377 #if defined(__SSE2__)
2378 , "xmm0", "xmm1", "xmm5"
2379 #endif
2380 );
2381 }
2382
2383 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
2384 asm volatile (
2385 ".p2align 4 \n"
2386 "1: \n"
2387 "movdqa (%0),%%xmm0 \n"
2388 "movdqa 0x10(%0),%%xmm1 \n"
2389 "lea 0x20(%0),%0 \n"
2390 "psrlw $0x8,%%xmm0 \n"
2391 "psrlw $0x8,%%xmm1 \n"
2392 "packuswb %%xmm1,%%xmm0 \n"
2393 "sub $0x10,%2 \n"
2394 "movdqa %%xmm0,(%1) \n"
2395 "lea 0x10(%1),%1 \n"
2396 "jg 1b \n"
2397 : "+r"(src_uyvy), // %0
2398 "+r"(dst_y), // %1
2399 "+r"(pix) // %2
2400 :
2401 : "memory", "cc"
2402 #if defined(__SSE2__)
2403 , "xmm0", "xmm1"
2404 #endif
2405 );
2406 }
2407
2408 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2409 uint8* dst_u, uint8* dst_v, int pix) {
2410 asm volatile (
2411 "pcmpeqb %%xmm5,%%xmm5 \n"
2412 "psrlw $0x8,%%xmm5 \n"
2413 "sub %1,%2 \n"
2414 ".p2align 4 \n"
2415 "1: \n"
2416 "movdqa (%0),%%xmm0 \n"
2417 "movdqa 0x10(%0),%%xmm1 \n"
2418 "movdqa (%0,%4,1),%%xmm2 \n"
2419 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
2420 "lea 0x20(%0),%0 \n"
2421 "pavgb %%xmm2,%%xmm0 \n"
2422 "pavgb %%xmm3,%%xmm1 \n"
2423 "pand %%xmm5,%%xmm0 \n"
2424 "pand %%xmm5,%%xmm1 \n"
2425 "packuswb %%xmm1,%%xmm0 \n"
2426 "movdqa %%xmm0,%%xmm1 \n"
2427 "pand %%xmm5,%%xmm0 \n"
2428 "packuswb %%xmm0,%%xmm0 \n"
2429 "psrlw $0x8,%%xmm1 \n"
2430 "packuswb %%xmm1,%%xmm1 \n"
2431 "movq %%xmm0,(%1) \n"
2432 "movq %%xmm1,(%1,%2) \n"
2433 "lea 0x8(%1),%1 \n"
2434 "sub $0x10,%3 \n"
2435 "jg 1b \n"
2436 : "+r"(src_uyvy), // %0
2437 "+r"(dst_u), // %1
2438 "+r"(dst_v), // %2
2439 "+r"(pix) // %3
2440 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2441 : "memory", "cc"
2442 #if defined(__SSE2__)
2443 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2444 #endif
2445 );
2446 }
2447
2448 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2449 uint8* dst_u, uint8* dst_v, int pix) {
2450 asm volatile (
2451 "pcmpeqb %%xmm5,%%xmm5 \n"
2452 "psrlw $0x8,%%xmm5 \n"
2453 "sub %1,%2 \n"
2454 ".p2align 4 \n"
2455 "1: \n"
2456 "movdqa (%0),%%xmm0 \n"
2457 "movdqa 0x10(%0),%%xmm1 \n"
2458 "lea 0x20(%0),%0 \n"
2459 "pand %%xmm5,%%xmm0 \n"
2460 "pand %%xmm5,%%xmm1 \n"
2461 "packuswb %%xmm1,%%xmm0 \n"
2462 "movdqa %%xmm0,%%xmm1 \n"
2463 "pand %%xmm5,%%xmm0 \n"
2464 "packuswb %%xmm0,%%xmm0 \n"
2465 "psrlw $0x8,%%xmm1 \n"
2466 "packuswb %%xmm1,%%xmm1 \n"
2467 "movq %%xmm0,(%1) \n"
2468 "movq %%xmm1,(%1,%2) \n"
2469 "lea 0x8(%1),%1 \n"
2470 "sub $0x10,%3 \n"
2471 "jg 1b \n"
2472 : "+r"(src_uyvy), // %0
2473 "+r"(dst_u), // %1
2474 "+r"(dst_v), // %2
2475 "+r"(pix) // %3
2476 :
2477 : "memory", "cc"
2478 #if defined(__SSE2__)
2479 , "xmm0", "xmm1", "xmm5"
2480 #endif
2481 );
2482 }
2483
2484 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2485 uint8* dst_y, int pix) {
2486 asm volatile (
2487 ".p2align 4 \n"
2488 "1: \n"
2489 "movdqu (%0),%%xmm0 \n"
2490 "movdqu 0x10(%0),%%xmm1 \n"
2491 "lea 0x20(%0),%0 \n"
2492 "psrlw $0x8,%%xmm0 \n"
2493 "psrlw $0x8,%%xmm1 \n"
2494 "packuswb %%xmm1,%%xmm0 \n"
2495 "sub $0x10,%2 \n"
2496 "movdqu %%xmm0,(%1) \n"
2497 "lea 0x10(%1),%1 \n"
2498 "jg 1b \n"
2499 : "+r"(src_uyvy), // %0
2500 "+r"(dst_y), // %1
2501 "+r"(pix) // %2
2502 :
2503 : "memory", "cc"
2504 #if defined(__SSE2__)
2505 , "xmm0", "xmm1"
2506 #endif
2507 );
2508 }
2509
2510 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
2511 uint8* dst_u, uint8* dst_v, int pix) {
2512 asm volatile (
2513 "pcmpeqb %%xmm5,%%xmm5 \n"
2514 "psrlw $0x8,%%xmm5 \n"
2515 "sub %1,%2 \n"
2516 ".p2align 4 \n"
2517 "1: \n"
2518 "movdqu (%0),%%xmm0 \n"
2519 "movdqu 0x10(%0),%%xmm1 \n"
2520 "movdqu (%0,%4,1),%%xmm2 \n"
2521 "movdqu 0x10(%0,%4,1),%%xmm3 \n"
2522 "lea 0x20(%0),%0 \n"
2523 "pavgb %%xmm2,%%xmm0 \n"
2524 "pavgb %%xmm3,%%xmm1 \n"
2525 "pand %%xmm5,%%xmm0 \n"
2526 "pand %%xmm5,%%xmm1 \n"
2527 "packuswb %%xmm1,%%xmm0 \n"
2528 "movdqa %%xmm0,%%xmm1 \n"
2529 "pand %%xmm5,%%xmm0 \n"
2530 "packuswb %%xmm0,%%xmm0 \n"
2531 "psrlw $0x8,%%xmm1 \n"
2532 "packuswb %%xmm1,%%xmm1 \n"
2533 "movq %%xmm0,(%1) \n"
2534 "movq %%xmm1,(%1,%2) \n"
2535 "lea 0x8(%1),%1 \n"
2536 "sub $0x10,%3 \n"
2537 "jg 1b \n"
2538 : "+r"(src_uyvy), // %0
2539 "+r"(dst_u), // %1
2540 "+r"(dst_v), // %2
2541 "+r"(pix) // %3
2542 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
2543 : "memory", "cc"
2544 #if defined(__SSE2__)
2545 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2546 #endif
2547 );
2548 }
2549
2550 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
2551 uint8* dst_u, uint8* dst_v, int pix) {
2552 asm volatile (
2553 "pcmpeqb %%xmm5,%%xmm5 \n"
2554 "psrlw $0x8,%%xmm5 \n"
2555 "sub %1,%2 \n"
2556 ".p2align 4 \n"
2557 "1: \n"
2558 "movdqu (%0),%%xmm0 \n"
2559 "movdqu 0x10(%0),%%xmm1 \n"
2560 "lea 0x20(%0),%0 \n"
2561 "pand %%xmm5,%%xmm0 \n"
2562 "pand %%xmm5,%%xmm1 \n"
2563 "packuswb %%xmm1,%%xmm0 \n"
2564 "movdqa %%xmm0,%%xmm1 \n"
2565 "pand %%xmm5,%%xmm0 \n"
2566 "packuswb %%xmm0,%%xmm0 \n"
2567 "psrlw $0x8,%%xmm1 \n"
2568 "packuswb %%xmm1,%%xmm1 \n"
2569 "movq %%xmm0,(%1) \n"
2570 "movq %%xmm1,(%1,%2) \n"
2571 "lea 0x8(%1),%1 \n"
2572 "sub $0x10,%3 \n"
2573 "jg 1b \n"
2574 : "+r"(src_uyvy), // %0
2575 "+r"(dst_u), // %1
2576 "+r"(dst_v), // %2
2577 "+r"(pix) // %3
2578 :
2579 : "memory", "cc"
2580 #if defined(__SSE2__)
2581 , "xmm0", "xmm1", "xmm5"
2582 #endif
2583 );
2584 }
2585 #endif // HAS_YUY2TOYROW_SSE2
2586
2587 #ifdef HAS_ARGBBLENDROW_SSE2
2588 // Blend 4 pixels at a time.
2589 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
2590 uint8* dst_argb, int width) {
2591 asm volatile (
2592 "pcmpeqb %%xmm7,%%xmm7 \n"
2593 "psrlw $0xf,%%xmm7 \n"
2594 "pcmpeqb %%xmm6,%%xmm6 \n"
2595 "psrlw $0x8,%%xmm6 \n"
2596 "pcmpeqb %%xmm5,%%xmm5 \n"
2597 "psllw $0x8,%%xmm5 \n"
2598 "pcmpeqb %%xmm4,%%xmm4 \n"
2599 "pslld $0x18,%%xmm4 \n"
2600 "sub $0x1,%3 \n"
2601 "je 91f \n"
2602 "jl 99f \n"
2603
2604 // 1 pixel loop until destination pointer is aligned.
2605 "10: \n"
2606 "test $0xf,%2 \n"
2607 "je 19f \n"
2608 "movd (%0),%%xmm3 \n"
2609 "lea 0x4(%0),%0 \n"
2610 "movdqa %%xmm3,%%xmm0 \n"
2611 "pxor %%xmm4,%%xmm3 \n"
2612 "movd (%1),%%xmm2 \n"
2613 "psrlw $0x8,%%xmm3 \n"
2614 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2615 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2616 "pand %%xmm6,%%xmm2 \n"
2617 "paddw %%xmm7,%%xmm3 \n"
2618 "pmullw %%xmm3,%%xmm2 \n"
2619 "movd (%1),%%xmm1 \n"
2620 "lea 0x4(%1),%1 \n"
2621 "psrlw $0x8,%%xmm1 \n"
2622 "por %%xmm4,%%xmm0 \n"
2623 "pmullw %%xmm3,%%xmm1 \n"
2624 "psrlw $0x8,%%xmm2 \n"
2625 "paddusb %%xmm2,%%xmm0 \n"
2626 "pand %%xmm5,%%xmm1 \n"
2627 "paddusb %%xmm1,%%xmm0 \n"
2628 "sub $0x1,%3 \n"
2629 "movd %%xmm0,(%2) \n"
2630 "lea 0x4(%2),%2 \n"
2631 "jge 10b \n"
2632
2633 "19: \n"
2634 "add $1-4,%3 \n"
2635 "jl 49f \n"
2636
2637 // 4 pixel loop.
2638 ".p2align 2 \n"
2639 "41: \n"
2640 "movdqu (%0),%%xmm3 \n"
2641 "lea 0x10(%0),%0 \n"
2642 "movdqa %%xmm3,%%xmm0 \n"
2643 "pxor %%xmm4,%%xmm3 \n"
2644 "movdqu (%1),%%xmm2 \n"
2645 "psrlw $0x8,%%xmm3 \n"
2646 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2647 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2648 "pand %%xmm6,%%xmm2 \n"
2649 "paddw %%xmm7,%%xmm3 \n"
2650 "pmullw %%xmm3,%%xmm2 \n"
2651 "movdqu (%1),%%xmm1 \n"
2652 "lea 0x10(%1),%1 \n"
2653 "psrlw $0x8,%%xmm1 \n"
2654 "por %%xmm4,%%xmm0 \n"
2655 "pmullw %%xmm3,%%xmm1 \n"
2656 "psrlw $0x8,%%xmm2 \n"
2657 "paddusb %%xmm2,%%xmm0 \n"
2658 "pand %%xmm5,%%xmm1 \n"
2659 "paddusb %%xmm1,%%xmm0 \n"
2660 "sub $0x4,%3 \n"
2661 "movdqa %%xmm0,(%2) \n"
2662 "lea 0x10(%2),%2 \n"
2663 "jge 41b \n"
2664
2665 "49: \n"
2666 "add $0x3,%3 \n"
2667 "jl 99f \n"
2668
2669 // 1 pixel loop.
2670 "91: \n"
2671 "movd (%0),%%xmm3 \n"
2672 "lea 0x4(%0),%0 \n"
2673 "movdqa %%xmm3,%%xmm0 \n"
2674 "pxor %%xmm4,%%xmm3 \n"
2675 "movd (%1),%%xmm2 \n"
2676 "psrlw $0x8,%%xmm3 \n"
2677 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
2678 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
2679 "pand %%xmm6,%%xmm2 \n"
2680 "paddw %%xmm7,%%xmm3 \n"
2681 "pmullw %%xmm3,%%xmm2 \n"
2682 "movd (%1),%%xmm1 \n"
2683 "lea 0x4(%1),%1 \n"
2684 "psrlw $0x8,%%xmm1 \n"
2685 "por %%xmm4,%%xmm0 \n"
2686 "pmullw %%xmm3,%%xmm1 \n"
2687 "psrlw $0x8,%%xmm2 \n"
2688 "paddusb %%xmm2,%%xmm0 \n"
2689 "pand %%xmm5,%%xmm1 \n"
2690 "paddusb %%xmm1,%%xmm0 \n"
2691 "sub $0x1,%3 \n"
2692 "movd %%xmm0,(%2) \n"
2693 "lea 0x4(%2),%2 \n"
2694 "jge 91b \n"
2695 "99: \n"
2696 : "+r"(src_argb0), // %0
2697 "+r"(src_argb1), // %1
2698 "+r"(dst_argb), // %2
2699 "+r"(width) // %3
2700 :
2701 : "memory", "cc"
2702 #if defined(__SSE2__)
2703 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2704 #endif
2705 );
2706 }
2707 #endif // HAS_ARGBBLENDROW_SSE2
2708
2709 #ifdef HAS_ARGBBLENDROW_SSSE3
2710 // Shuffle table for isolating alpha.
2711 CONST uvec8 kShuffleAlpha = {
2712 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
2713 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
2714 };
2715
2716 // Blend 4 pixels at a time.
2718
2719 // Same as SSE2, but replaces
2720 // psrlw xmm3, 8 // alpha
2721 // pshufhw xmm3, xmm3,0F5h // 8 alpha words
2722 // pshuflw xmm3, xmm3,0F5h
2723 // with..
2724 // pshufb xmm3, kShuffleAlpha // alpha
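// Per-channel arithmetic implemented by both blend rows, as a scalar sketch
// (illustrative only): src0 is treated as the foreground in a premultiplied
// style "over", using the same (256 - a) >> 8 approximation as the asm.
// Hypothetical name, not compiled in.
#if 0
static void ARGBBlendPixelSketch_C(const uint8* src0, const uint8* src1,
                                   uint8* dst) {
  uint32 a = src0[3];
  for (int c = 0; c < 3; ++c) {  // B, G, R.
    uint32 v = src0[c] + ((src1[c] * (256 - a)) >> 8);
    dst[c] = v > 255 ? 255 : static_cast<uint8>(v);  // paddusb saturation.
  }
  dst[3] = 255;  // Destination alpha is forced fully opaque.
}
#endif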
2725
2726 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
2727 uint8* dst_argb, int width) {
2728 asm volatile (
2729 "pcmpeqb %%xmm7,%%xmm7 \n"
2730 "psrlw $0xf,%%xmm7 \n"
2731 "pcmpeqb %%xmm6,%%xmm6 \n"
2732 "psrlw $0x8,%%xmm6 \n"
2733 "pcmpeqb %%xmm5,%%xmm5 \n"
2734 "psllw $0x8,%%xmm5 \n"
2735 "pcmpeqb %%xmm4,%%xmm4 \n"
2736 "pslld $0x18,%%xmm4 \n"
2737 "sub $0x1,%3 \n"
2738 "je 91f \n"
2739 "jl 99f \n"
2740
2741 // 1 pixel loop until destination pointer is aligned.
2742 "10: \n"
2743 "test $0xf,%2 \n"
2744 "je 19f \n"
2745 "movd (%0),%%xmm3 \n"
2746 "lea 0x4(%0),%0 \n"
2747 "movdqa %%xmm3,%%xmm0 \n"
2748 "pxor %%xmm4,%%xmm3 \n"
2749 "movd (%1),%%xmm2 \n"
2750 "pshufb %4,%%xmm3 \n"
2751 "pand %%xmm6,%%xmm2 \n"
2752 "paddw %%xmm7,%%xmm3 \n"
2753 "pmullw %%xmm3,%%xmm2 \n"
2754 "movd (%1),%%xmm1 \n"
2755 "lea 0x4(%1),%1 \n"
2756 "psrlw $0x8,%%xmm1 \n"
2757 "por %%xmm4,%%xmm0 \n"
2758 "pmullw %%xmm3,%%xmm1 \n"
2759 "psrlw $0x8,%%xmm2 \n"
2760 "paddusb %%xmm2,%%xmm0 \n"
2761 "pand %%xmm5,%%xmm1 \n"
2762 "paddusb %%xmm1,%%xmm0 \n"
2763 "sub $0x1,%3 \n"
2764 "movd %%xmm0,(%2) \n"
2765 "lea 0x4(%2),%2 \n"
2766 "jge 10b \n"
2767
2768 "19: \n"
2769 "add $1-4,%3 \n"
2770 "jl 49f \n"
2771 "test $0xf,%0 \n"
2772 "jne 41f \n"
2773 "test $0xf,%1 \n"
2774 "jne 41f \n"
2775
2776 // 4 pixel loop.
2777 ".p2align 2 \n"
2778 "40: \n"
2779 "movdqa (%0),%%xmm3 \n"
2780 "lea 0x10(%0),%0 \n"
2781 "movdqa %%xmm3,%%xmm0 \n"
2782 "pxor %%xmm4,%%xmm3 \n"
2783 "movdqa (%1),%%xmm2 \n"
2784 "pshufb %4,%%xmm3 \n"
2785 "pand %%xmm6,%%xmm2 \n"
2786 "paddw %%xmm7,%%xmm3 \n"
2787 "pmullw %%xmm3,%%xmm2 \n"
2788 "movdqa (%1),%%xmm1 \n"
2789 "lea 0x10(%1),%1 \n"
2790 "psrlw $0x8,%%xmm1 \n"
2791 "por %%xmm4,%%xmm0 \n"
2792 "pmullw %%xmm3,%%xmm1 \n"
2793 "psrlw $0x8,%%xmm2 \n"
2794 "paddusb %%xmm2,%%xmm0 \n"
2795 "pand %%xmm5,%%xmm1 \n"
2796 "paddusb %%xmm1,%%xmm0 \n"
2797 "sub $0x4,%3 \n"
2798 "movdqa %%xmm0,(%2) \n"
2799 "lea 0x10(%2),%2 \n"
2800 "jge 40b \n"
2801 "jmp 49f \n"
2802
2803 // 4 pixel unaligned loop.
2804 ".p2align 2 \n"
2805 "41: \n"
2806 "movdqu (%0),%%xmm3 \n"
2807 "lea 0x10(%0),%0 \n"
2808 "movdqa %%xmm3,%%xmm0 \n"
2809 "pxor %%xmm4,%%xmm3 \n"
2810 "movdqu (%1),%%xmm2 \n"
2811 "pshufb %4,%%xmm3 \n"
2812 "pand %%xmm6,%%xmm2 \n"
2813 "paddw %%xmm7,%%xmm3 \n"
2814 "pmullw %%xmm3,%%xmm2 \n"
2815 "movdqu (%1),%%xmm1 \n"
2816 "lea 0x10(%1),%1 \n"
2817 "psrlw $0x8,%%xmm1 \n"
2818 "por %%xmm4,%%xmm0 \n"
2819 "pmullw %%xmm3,%%xmm1 \n"
2820 "psrlw $0x8,%%xmm2 \n"
2821 "paddusb %%xmm2,%%xmm0 \n"
2822 "pand %%xmm5,%%xmm1 \n"
2823 "paddusb %%xmm1,%%xmm0 \n"
2824 "sub $0x4,%3 \n"
2825 "movdqa %%xmm0,(%2) \n"
2826 "lea 0x10(%2),%2 \n"
2827 "jge 41b \n"
2828
2829 "49: \n"
2830 "add $0x3,%3 \n"
2831 "jl 99f \n"
2832
2833 // 1 pixel loop.
2834 "91: \n"
2835 "movd (%0),%%xmm3 \n"
2836 "lea 0x4(%0),%0 \n"
2837 "movdqa %%xmm3,%%xmm0 \n"
2838 "pxor %%xmm4,%%xmm3 \n"
2839 "movd (%1),%%xmm2 \n"
2840 "pshufb %4,%%xmm3 \n"
2841 "pand %%xmm6,%%xmm2 \n"
2842 "paddw %%xmm7,%%xmm3 \n"
2843 "pmullw %%xmm3,%%xmm2 \n"
2844 "movd (%1),%%xmm1 \n"
2845 "lea 0x4(%1),%1 \n"
2846 "psrlw $0x8,%%xmm1 \n"
2847 "por %%xmm4,%%xmm0 \n"
2848 "pmullw %%xmm3,%%xmm1 \n"
2849 "psrlw $0x8,%%xmm2 \n"
2850 "paddusb %%xmm2,%%xmm0 \n"
2851 "pand %%xmm5,%%xmm1 \n"
2852 "paddusb %%xmm1,%%xmm0 \n"
2853 "sub $0x1,%3 \n"
2854 "movd %%xmm0,(%2) \n"
2855 "lea 0x4(%2),%2 \n"
2856 "jge 91b \n"
2857 "99: \n"
2858 : "+r"(src_argb0), // %0
2859 "+r"(src_argb1), // %1
2860 "+r"(dst_argb), // %2
2861 "+r"(width) // %3
2862 : "m"(kShuffleAlpha) // %4
2863 : "memory", "cc"
2864 #if defined(__SSE2__)
2865 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2866 #endif
2867 );
2868 }
2869 #endif // HAS_ARGBBLENDROW_SSSE3
2870
2871 #ifdef HAS_ARGBATTENUATE_SSE2
2872 // Attenuate 4 pixels at a time.
2873 // aligned to 16 bytes
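// Scalar sketch of the attenuation math (illustrative only): each color
// channel is scaled by alpha using the same 8.8 fixed-point product the
// pmulhuw rows compute, roughly v * a / 255. Hypothetical name, not compiled.
#if 0
static void ARGBAttenuateRowSketch_C(const uint8* src_argb, uint8* dst_argb,
                                     int width) {
  for (int x = 0; x < width; ++x) {
    uint32 a = src_argb[3];
    for (int c = 0; c < 3; ++c) {  // B, G, R.
      uint32 v = src_argb[c];
      dst_argb[c] =
          static_cast<uint8>((((v * 0x0101) * (a * 0x0101)) >> 16) >> 8);
    }
    dst_argb[3] = src_argb[3];  // Alpha is copied through unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif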
2874 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
2875 asm volatile (
2876 "sub %0,%1 \n"
2877 "pcmpeqb %%xmm4,%%xmm4 \n"
2878 "pslld $0x18,%%xmm4 \n"
2879 "pcmpeqb %%xmm5,%%xmm5 \n"
2880 "psrld $0x8,%%xmm5 \n"
2881
2882 // 4 pixel loop.
2883 ".p2align 4 \n"
2884 "1: \n"
2885 "movdqa (%0),%%xmm0 \n"
2886 "punpcklbw %%xmm0,%%xmm0 \n"
2887 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
2888 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2889 "pmulhuw %%xmm2,%%xmm0 \n"
2890 "movdqa (%0),%%xmm1 \n"
2891 "punpckhbw %%xmm1,%%xmm1 \n"
2892 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
2893 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
2894 "pmulhuw %%xmm2,%%xmm1 \n"
2895 "movdqa (%0),%%xmm2 \n"
2896 "psrlw $0x8,%%xmm0 \n"
2897 "pand %%xmm4,%%xmm2 \n"
2898 "psrlw $0x8,%%xmm1 \n"
2899 "packuswb %%xmm1,%%xmm0 \n"
2900 "pand %%xmm5,%%xmm0 \n"
2901 "por %%xmm2,%%xmm0 \n"
2902 "sub $0x4,%2 \n"
2903 "movdqa %%xmm0,(%0,%1,1) \n"
2904 "lea 0x10(%0),%0 \n"
2905 "jg 1b \n"
2906 : "+r"(src_argb), // %0
2907 "+r"(dst_argb), // %1
2908 "+r"(width) // %2
2909 :
2910 : "memory", "cc"
2911 #if defined(__SSE2__)
2912 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2913 #endif
2914 );
2915 }
2916 #endif // HAS_ARGBATTENUATE_SSE2
2917
2918 #ifdef HAS_ARGBATTENUATEROW_SSSE3
2919 // Shuffle table duplicating alpha
2920 CONST uvec8 kShuffleAlpha0 = {
2921 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
2922 };
2923 CONST uvec8 kShuffleAlpha1 = {
2924 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
2925 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
2926 };
2927 // Attenuate 4 pixels at a time.
2928 // aligned to 16 bytes
2929 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
2930 asm volatile (
2931 "sub %0,%1 \n"
2932 "pcmpeqb %%xmm3,%%xmm3 \n"
2933 "pslld $0x18,%%xmm3 \n"
2934 "movdqa %3,%%xmm4 \n"
2935 "movdqa %4,%%xmm5 \n"
2936
2937 // 4 pixel loop.
2938 ".p2align 4 \n"
2939 "1: \n"
2940 "movdqa (%0),%%xmm0 \n"
2941 "pshufb %%xmm4,%%xmm0 \n"
2942 "movdqa (%0),%%xmm1 \n"
2943 "punpcklbw %%xmm1,%%xmm1 \n"
2944 "pmulhuw %%xmm1,%%xmm0 \n"
2945 "movdqa (%0),%%xmm1 \n"
2946 "pshufb %%xmm5,%%xmm1 \n"
2947 "movdqa (%0),%%xmm2 \n"
2948 "punpckhbw %%xmm2,%%xmm2 \n"
2949 "pmulhuw %%xmm2,%%xmm1 \n"
2950 "movdqa (%0),%%xmm2 \n"
2951 "pand %%xmm3,%%xmm2 \n"
2952 "psrlw $0x8,%%xmm0 \n"
2953 "psrlw $0x8,%%xmm1 \n"
2954 "packuswb %%xmm1,%%xmm0 \n"
2955 "por %%xmm2,%%xmm0 \n"
2956 "sub $0x4,%2 \n"
2957 "movdqa %%xmm0,(%0,%1,1) \n"
2958 "lea 0x10(%0),%0 \n"
2959 "jg 1b \n"
2960 : "+r"(src_argb), // %0
2961 "+r"(dst_argb), // %1
2962 "+r"(width) // %2
2963 : "m"(kShuffleAlpha0), // %3
2964 "m"(kShuffleAlpha1) // %4
2965 : "memory", "cc"
2966 #if defined(__SSE2__)
2967 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2968 #endif
2969 );
2970 }
2971 #endif // HAS_ARGBATTENUATEROW_SSSE3
2972
2973 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
2974 // Unattenuate 4 pixels at a time.
2975 // aligned to 16 bytes
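// Scalar sketch of the intent (illustrative only): undo attenuation by
// scaling each color channel with a reciprocal of alpha, roughly
// min(255, v * 255 / a). The asm uses the fixed_invtbl8 reciprocal table
// instead of a divide. Hypothetical name; the a == 0 handling here is an
// assumption, not taken from the table. Not compiled in.
#if 0
static void ARGBUnattenuateRowSketch_C(const uint8* src_argb, uint8* dst_argb,
                                       int width) {
  for (int x = 0; x < width; ++x) {
    uint32 a = src_argb[3];
    for (int c = 0; c < 3; ++c) {  // B, G, R.
      uint32 v = a ? (src_argb[c] * 255u) / a : src_argb[c];
      dst_argb[c] = v > 255 ? 255 : static_cast<uint8>(v);
    }
    dst_argb[3] = src_argb[3];  // Alpha is copied through unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif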
2976 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
2977 int width) {
2978 uintptr_t alpha = 0;
2979 asm volatile (
2980 "sub %0,%1 \n"
2981 "pcmpeqb %%xmm4,%%xmm4 \n"
2982 "pslld $0x18,%%xmm4 \n"
2983
2984 // 4 pixel loop.
2985 ".p2align 4 \n"
2986 "1: \n"
2987 "movdqa (%0),%%xmm0 \n"
2988 "movzb 0x3(%0),%3 \n"
2989 "punpcklbw %%xmm0,%%xmm0 \n"
2990 "movd 0x0(%4,%3,4),%%xmm2 \n"
2991 "movzb 0x7(%0),%3 \n"
2992 "movd 0x0(%4,%3,4),%%xmm3 \n"
2993 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
2994 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
2995 "movlhps %%xmm3,%%xmm2 \n"
2996 "pmulhuw %%xmm2,%%xmm0 \n"
2997 "movdqa (%0),%%xmm1 \n"
2998 "movzb 0xb(%0),%3 \n"
2999 "punpckhbw %%xmm1,%%xmm1 \n"
3000 "movd 0x0(%4,%3,4),%%xmm2 \n"
3001 "movzb 0xf(%0),%3 \n"
3002 "movd 0x0(%4,%3,4),%%xmm3 \n"
3003 "pshuflw $0xc0,%%xmm2,%%xmm2 \n"
3004 "pshuflw $0xc0,%%xmm3,%%xmm3 \n"
3005 "movlhps %%xmm3,%%xmm2 \n"
3006 "pmulhuw %%xmm2,%%xmm1 \n"
3007 "movdqa (%0),%%xmm2 \n"
3008 "pand %%xmm4,%%xmm2 \n"
3009 "packuswb %%xmm1,%%xmm0 \n"
3010 "por %%xmm2,%%xmm0 \n"
3011 "sub $0x4,%2 \n"
3012 "movdqa %%xmm0,(%0,%1,1) \n"
3013 "lea 0x10(%0),%0 \n"
3014 "jg 1b \n"
3015 : "+r"(src_argb), // %0
3016 "+r"(dst_argb), // %1
3017 "+r"(width), // %2
3018 "+r"(alpha) // %3
3019 : "r"(fixed_invtbl8) // %4
3020 : "memory", "cc"
3021 #if defined(__SSE2__)
3022 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3023 #endif
3024 );
3025 }
3026 #endif // HAS_ARGBUNATTENUATEROW_SSE2
3027
3028 #ifdef HAS_ARGBGRAYROW_SSSE3
3029 // Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
3030 CONST vec8 kARGBToGray = {
3031 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
3032 };
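// Scalar sketch of the weighting above (illustrative only): the pmaddubsw /
// phaddw / psrlw sequence computes y = (14 * B + 76 * G + 38 * R) >> 7 and
// writes it to B, G and R while keeping the original alpha. Hypothetical
// name, not compiled in.
#if 0
static void ARGBGrayRowSketch_C(const uint8* src_argb, uint8* dst_argb,
                                int width) {
  for (int x = 0; x < width; ++x) {
    uint8 y = static_cast<uint8>(
        (14 * src_argb[0] + 76 * src_argb[1] + 38 * src_argb[2]) >> 7);
    dst_argb[0] = dst_argb[1] = dst_argb[2] = y;
    dst_argb[3] = src_argb[3];  // Alpha preserved.
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif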
3033
3034 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
3035 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3036 asm volatile (
3037 "movdqa %3,%%xmm4 \n"
3038 "sub %0,%1 \n"
3039
3040 // 8 pixel loop.
3041 ".p2align 4 \n"
3042 "1: \n"
3043 "movdqa (%0),%%xmm0 \n"
3044 "movdqa 0x10(%0),%%xmm1 \n"
3045 "pmaddubsw %%xmm4,%%xmm0 \n"
3046 "pmaddubsw %%xmm4,%%xmm1 \n"
3047 "phaddw %%xmm1,%%xmm0 \n"
3048 "psrlw $0x7,%%xmm0 \n"
3049 "packuswb %%xmm0,%%xmm0 \n"
3050 "movdqa (%0),%%xmm2 \n"
3051 "movdqa 0x10(%0),%%xmm3 \n"
3052 "psrld $0x18,%%xmm2 \n"
3053 "psrld $0x18,%%xmm3 \n"
3054 "packuswb %%xmm3,%%xmm2 \n"
3055 "packuswb %%xmm2,%%xmm2 \n"
3056 "movdqa %%xmm0,%%xmm3 \n"
3057 "punpcklbw %%xmm0,%%xmm0 \n"
3058 "punpcklbw %%xmm2,%%xmm3 \n"
3059 "movdqa %%xmm0,%%xmm1 \n"
3060 "punpcklwd %%xmm3,%%xmm0 \n"
3061 "punpckhwd %%xmm3,%%xmm1 \n"
3062 "sub $0x8,%2 \n"
3063 "movdqa %%xmm0,(%0,%1,1) \n"
3064 "movdqa %%xmm1,0x10(%0,%1,1) \n"
3065 "lea 0x20(%0),%0 \n"
3066 "jg 1b \n"
3067 : "+r"(src_argb), // %0
3068 "+r"(dst_argb), // %1
3069 "+r"(width) // %2
3070 : "m"(kARGBToGray) // %3
3071 : "memory", "cc"
3072 #if defined(__SSE2__)
3073 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
3074 #endif
3075 );
3076 }
3077 #endif // HAS_ARGBGRAYROW_SSSE3
3078
3079 #ifdef HAS_ARGBSEPIAROW_SSSE3
3080 // b = (r * 35 + g * 68 + b * 17) >> 7
3081 // g = (r * 45 + g * 88 + b * 22) >> 7
3082 // r = (r * 50 + g * 98 + b * 24) >> 7
3083 // Constant for ARGB color to sepia tone
3084 CONST vec8 kARGBToSepiaB = {
3085 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3086 };
3087
3088 CONST vec8 kARGBToSepiaG = {
3089 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3090 };
3091
3092 CONST vec8 kARGBToSepiaR = {
3093 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3094 };
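// Scalar sketch of the three weightings above (illustrative only). The green
// and red weights sum to more than 128, so the asm relies on packuswb to
// saturate at 255; the sketch clamps explicitly. Hypothetical name, not
// compiled in.
#if 0
static void ARGBSepiaRowSketch_C(uint8* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
    int sb = (17 * b + 68 * g + 35 * r) >> 7;
    int sg = (22 * b + 88 * g + 45 * r) >> 7;
    int sr = (24 * b + 98 * g + 50 * r) >> 7;
    dst_argb[0] = static_cast<uint8>(sb > 255 ? 255 : sb);
    dst_argb[1] = static_cast<uint8>(sg > 255 ? 255 : sg);
    dst_argb[2] = static_cast<uint8>(sr > 255 ? 255 : sr);
    // Alpha (dst_argb[3]) is left unchanged, matching the psrld/por path.
    dst_argb += 4;
  }
}
#endif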
3095
3096 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
3097 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3098 asm volatile (
3099 "movdqa %2,%%xmm2 \n"
3100 "movdqa %3,%%xmm3 \n"
3101 "movdqa %4,%%xmm4 \n"
3102
3103 // 8 pixel loop.
3104 ".p2align 4 \n"
3105 "1: \n"
3106 "movdqa (%0),%%xmm0 \n"
3107 "movdqa 0x10(%0),%%xmm6 \n"
3108 "pmaddubsw %%xmm2,%%xmm0 \n"
3109 "pmaddubsw %%xmm2,%%xmm6 \n"
3110 "phaddw %%xmm6,%%xmm0 \n"
3111 "psrlw $0x7,%%xmm0 \n"
3112 "packuswb %%xmm0,%%xmm0 \n"
3113 "movdqa (%0),%%xmm5 \n"
3114 "movdqa 0x10(%0),%%xmm1 \n"
3115 "pmaddubsw %%xmm3,%%xmm5 \n"
3116 "pmaddubsw %%xmm3,%%xmm1 \n"
3117 "phaddw %%xmm1,%%xmm5 \n"
3118 "psrlw $0x7,%%xmm5 \n"
3119 "packuswb %%xmm5,%%xmm5 \n"
3120 "punpcklbw %%xmm5,%%xmm0 \n"
3121 "movdqa (%0),%%xmm5 \n"
3122 "movdqa 0x10(%0),%%xmm1 \n"
3123 "pmaddubsw %%xmm4,%%xmm5 \n"
3124 "pmaddubsw %%xmm4,%%xmm1 \n"
3125 "phaddw %%xmm1,%%xmm5 \n"
3126 "psrlw $0x7,%%xmm5 \n"
3127 "packuswb %%xmm5,%%xmm5 \n"
3128 "movdqa (%0),%%xmm6 \n"
3129 "movdqa 0x10(%0),%%xmm1 \n"
3130 "psrld $0x18,%%xmm6 \n"
3131 "psrld $0x18,%%xmm1 \n"
3132 "packuswb %%xmm1,%%xmm6 \n"
3133 "packuswb %%xmm6,%%xmm6 \n"
3134 "punpcklbw %%xmm6,%%xmm5 \n"
3135 "movdqa %%xmm0,%%xmm1 \n"
3136 "punpcklwd %%xmm5,%%xmm0 \n"
3137 "punpckhwd %%xmm5,%%xmm1 \n"
3138 "sub $0x8,%1 \n"
3139 "movdqa %%xmm0,(%0) \n"
3140 "movdqa %%xmm1,0x10(%0) \n"
3141 "lea 0x20(%0),%0 \n"
3142 "jg 1b \n"
3143 : "+r"(dst_argb), // %0
3144 "+r"(width) // %1
3145 : "m"(kARGBToSepiaB), // %2
3146 "m"(kARGBToSepiaG), // %3
3147 "m"(kARGBToSepiaR) // %4
3148 : "memory", "cc"
3149 #if defined(__SSE2__)
3150 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3151 #endif
3152 );
3153 }
3154 #endif // HAS_ARGBSEPIAROW_SSSE3
3155
3156 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3157 // Transform 8 ARGB pixels (32 bytes) with a color matrix.
3158 // Same as Sepia except matrix is provided.
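// Scalar sketch of the transform (illustrative only): matrix_argb holds
// signed coefficients in groups of four, applied to the B, G, R, A inputs
// with a >> 7 fixed point; only the first three groups are read because the
// output alpha is copied from the source. Hypothetical name, not compiled in.
#if 0
static void ARGBColorMatrixRowSketch_C(uint8* dst_argb, const int8* m,
                                       int width) {
  for (int x = 0; x < width; ++x) {
    int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2], a = dst_argb[3];
    for (int c = 0; c < 3; ++c) {  // Output B, G, R.
      int v = (b * m[c * 4 + 0] + g * m[c * 4 + 1] +
               r * m[c * 4 + 2] + a * m[c * 4 + 3]) >> 7;
      dst_argb[c] = static_cast<uint8>(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    dst_argb[3] = static_cast<uint8>(a);  // Alpha preserved.
    dst_argb += 4;
  }
}
#endif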
3159 void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
3160 int width) {
3161 asm volatile (
3162 "movd (%2),%%xmm2 \n"
3163 "movd 0x4(%2),%%xmm3 \n"
3164 "movd 0x8(%2),%%xmm4 \n"
3165 "pshufd $0x0,%%xmm2,%%xmm2 \n"
3166 "pshufd $0x0,%%xmm3,%%xmm3 \n"
3167 "pshufd $0x0,%%xmm4,%%xmm4 \n"
3168
3169 // 8 pixel loop.
3170 ".p2align 4 \n"
3171 "1: \n"
3172 "movdqa (%0),%%xmm0 \n"
3173 "movdqa 0x10(%0),%%xmm6 \n"
3174 "pmaddubsw %%xmm2,%%xmm0 \n"
3175 "pmaddubsw %%xmm2,%%xmm6 \n"
3176 "movdqa (%0),%%xmm5 \n"
3177 "movdqa 0x10(%0),%%xmm1 \n"
3178 "pmaddubsw %%xmm3,%%xmm5 \n"
3179 "pmaddubsw %%xmm3,%%xmm1 \n"
3180 "phaddsw %%xmm6,%%xmm0 \n"
3181 "phaddsw %%xmm1,%%xmm5 \n"
3182 "psraw $0x7,%%xmm0 \n"
3183 "psraw $0x7,%%xmm5 \n"
3184 "packuswb %%xmm0,%%xmm0 \n"
3185 "packuswb %%xmm5,%%xmm5 \n"
3186 "punpcklbw %%xmm5,%%xmm0 \n"
3187 "movdqa (%0),%%xmm5 \n"
3188 "movdqa 0x10(%0),%%xmm1 \n"
3189 "pmaddubsw %%xmm4,%%xmm5 \n"
3190 "pmaddubsw %%xmm4,%%xmm1 \n"
3191 "phaddsw %%xmm1,%%xmm5 \n"
3192 "psraw $0x7,%%xmm5 \n"
3193 "packuswb %%xmm5,%%xmm5 \n"
3194 "movdqa (%0),%%xmm6 \n"
3195 "movdqa 0x10(%0),%%xmm1 \n"
3196 "psrld $0x18,%%xmm6 \n"
3197 "psrld $0x18,%%xmm1 \n"
3198 "packuswb %%xmm1,%%xmm6 \n"
3199 "packuswb %%xmm6,%%xmm6 \n"
3200 "movdqa %%xmm0,%%xmm1 \n"
3201 "punpcklbw %%xmm6,%%xmm5 \n"
3202 "punpcklwd %%xmm5,%%xmm0 \n"
3203 "punpckhwd %%xmm5,%%xmm1 \n"
3204 "sub $0x8,%1 \n"
3205 "movdqa %%xmm0,(%0) \n"
3206 "movdqa %%xmm1,0x10(%0) \n"
3207 "lea 0x20(%0),%0 \n"
3208 "jg 1b \n"
3209 : "+r"(dst_argb), // %0
3210 "+r"(width) // %1
3211 : "r"(matrix_argb) // %2
3212 : "memory", "cc"
3213 #if defined(__SSE2__)
3214 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3215 #endif
3216 );
3217 }
3218 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
3219
3220 #ifdef HAS_ARGBQUANTIZEROW_SSE2
3221 // Quantize 4 ARGB pixels (16 bytes).
3222 // aligned to 16 bytes
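// Scalar sketch of the quantization (illustrative only): each color channel
// is mapped to ((v * scale) >> 16) * interval_size + interval_offset, with
// the alpha byte passed through. Hypothetical name, not compiled in.
#if 0
static void ARGBQuantizeRowSketch_C(uint8* dst_argb, int scale,
                                    int interval_size, int interval_offset,
                                    int width) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 3; ++c) {  // B, G, R; alpha is left as-is.
      int v = dst_argb[c];
      dst_argb[c] = static_cast<uint8>(
          ((v * scale) >> 16) * interval_size + interval_offset);
    }
    dst_argb += 4;
  }
}
#endif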
3223 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3224 int interval_offset, int width) {
3225 asm volatile (
3226 "movd %2,%%xmm2 \n"
3227 "movd %3,%%xmm3 \n"
3228 "movd %4,%%xmm4 \n"
3229 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3230 "pshufd $0x44,%%xmm2,%%xmm2 \n"
3231 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3232 "pshufd $0x44,%%xmm3,%%xmm3 \n"
3233 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
3234 "pshufd $0x44,%%xmm4,%%xmm4 \n"
3235 "pxor %%xmm5,%%xmm5 \n"
3236 "pcmpeqb %%xmm6,%%xmm6 \n"
3237 "pslld $0x18,%%xmm6 \n"
3238
3239 // 4 pixel loop.
3240 ".p2align 2 \n"
3241 "1: \n"
3242 "movdqa (%0),%%xmm0 \n"
3243 "punpcklbw %%xmm5,%%xmm0 \n"
3244 "pmulhuw %%xmm2,%%xmm0 \n"
3245 "movdqa (%0),%%xmm1 \n"
3246 "punpckhbw %%xmm5,%%xmm1 \n"
3247 "pmulhuw %%xmm2,%%xmm1 \n"
3248 "pmullw %%xmm3,%%xmm0 \n"
3249 "movdqa (%0),%%xmm7 \n"
3250 "pmullw %%xmm3,%%xmm1 \n"
3251 "pand %%xmm6,%%xmm7 \n"
3252 "paddw %%xmm4,%%xmm0 \n"
3253 "paddw %%xmm4,%%xmm1 \n"
3254 "packuswb %%xmm1,%%xmm0 \n"
3255 "por %%xmm7,%%xmm0 \n"
3256 "sub $0x4,%1 \n"
3257 "movdqa %%xmm0,(%0) \n"
3258 "lea 0x10(%0),%0 \n"
3259 "jg 1b \n"
3260 : "+r"(dst_argb), // %0
3261 "+r"(width) // %1
3262 : "r"(scale), // %2
3263 "r"(interval_size), // %3
3264 "r"(interval_offset) // %4
3265 : "memory", "cc"
3266 #if defined(__SSE2__)
3267 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3268 #endif
3269 );
3270 }
3271 #endif // HAS_ARGBQUANTIZEROW_SSE2
3272
3273 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
3274 // Creates a table of cumulative sums where each value is a sum of all values
3275 // above and to the left of the value, inclusive of the value.
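// Scalar sketch (illustrative only; hypothetical name, not compiled in):
// keep a running per-channel sum across the row and add the row above.
#if 0
static void ComputeCumulativeSumRowSketch_C(const uint8* row, int32* cumsum,
                                            const int32* previous_cumsum,
                                            int width) {
  int32 sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = previous_cumsum[x * 4 + c] + sum[c];
    }
  }
}
#endif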
3276 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
3277 const int32* previous_cumsum, int width) {
3278 asm volatile (
3279 "sub %1,%2 \n"
3280 "pxor %%xmm0,%%xmm0 \n"
3281 "pxor %%xmm1,%%xmm1 \n"
3282 "sub $0x4,%3 \n"
3283 "jl 49f \n"
3284 "test $0xf,%1 \n"
3285 "jne 49f \n"
3286
3287   // 4 pixel loop.
3288 ".p2align 2 \n"
3289 "40: \n"
3290 "movdqu (%0),%%xmm2 \n"
3291 "lea 0x10(%0),%0 \n"
3292 "movdqa %%xmm2,%%xmm4 \n"
3293 "punpcklbw %%xmm1,%%xmm2 \n"
3294 "movdqa %%xmm2,%%xmm3 \n"
3295 "punpcklwd %%xmm1,%%xmm2 \n"
3296 "punpckhwd %%xmm1,%%xmm3 \n"
3297 "punpckhbw %%xmm1,%%xmm4 \n"
3298 "movdqa %%xmm4,%%xmm5 \n"
3299 "punpcklwd %%xmm1,%%xmm4 \n"
3300 "punpckhwd %%xmm1,%%xmm5 \n"
3301 "paddd %%xmm2,%%xmm0 \n"
3302 "movdqa (%1,%2,1),%%xmm2 \n"
3303 "paddd %%xmm0,%%xmm2 \n"
3304 "paddd %%xmm3,%%xmm0 \n"
3305 "movdqa 0x10(%1,%2,1),%%xmm3 \n"
3306 "paddd %%xmm0,%%xmm3 \n"
3307 "paddd %%xmm4,%%xmm0 \n"
3308 "movdqa 0x20(%1,%2,1),%%xmm4 \n"
3309 "paddd %%xmm0,%%xmm4 \n"
3310 "paddd %%xmm5,%%xmm0 \n"
3311 "movdqa 0x30(%1,%2,1),%%xmm5 \n"
3312 "paddd %%xmm0,%%xmm5 \n"
3313 "movdqa %%xmm2,(%1) \n"
3314 "movdqa %%xmm3,0x10(%1) \n"
3315 "movdqa %%xmm4,0x20(%1) \n"
3316 "movdqa %%xmm5,0x30(%1) \n"
3317 "lea 0x40(%1),%1 \n"
3318 "sub $0x4,%3 \n"
3319 "jge 40b \n"
3320
3321 "49: \n"
3322 "add $0x3,%3 \n"
3323 "jl 19f \n"
3324
3325   // 1 pixel loop.
3326 ".p2align 2 \n"
3327 "10: \n"
3328 "movd (%0),%%xmm2 \n"
3329 "lea 0x4(%0),%0 \n"
3330 "punpcklbw %%xmm1,%%xmm2 \n"
3331 "punpcklwd %%xmm1,%%xmm2 \n"
3332 "paddd %%xmm2,%%xmm0 \n"
3333 "movdqu (%1,%2,1),%%xmm2 \n"
3334 "paddd %%xmm0,%%xmm2 \n"
3335 "movdqu %%xmm2,(%1) \n"
3336 "lea 0x10(%1),%1 \n"
3337 "sub $0x1,%3 \n"
3338 "jge 10b \n"
3339
3340 "19: \n"
3341 : "+r"(row), // %0
3342 "+r"(cumsum), // %1
3343 "+r"(previous_cumsum), // %2
3344 "+r"(width) // %3
3345 :
3346 : "memory", "cc"
3347 #if defined(__SSE2__)
3348 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3349 #endif
3350 );
3351 }
3352 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
3353
3354 #ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
3355 void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
3356 int width, int area, uint8* dst, int count) {
3357 asm volatile (
3358 "movd %5,%%xmm4 \n"
3359 "cvtdq2ps %%xmm4,%%xmm4 \n"
3360 "rcpss %%xmm4,%%xmm4 \n"
3361 "pshufd $0x0,%%xmm4,%%xmm4 \n"
3362 "sub $0x4,%3 \n"
3363 "jl 49f \n"
3364
3365   // 4 pixel loop.
3366 ".p2align 2 \n"
3367 "40: \n"
3368 "movdqa (%0),%%xmm0 \n"
3369 "movdqa 0x10(%0),%%xmm1 \n"
3370 "movdqa 0x20(%0),%%xmm2 \n"
3371 "movdqa 0x30(%0),%%xmm3 \n"
3372 "psubd (%0,%4,4),%%xmm0 \n"
3373 "psubd 0x10(%0,%4,4),%%xmm1 \n"
3374 "psubd 0x20(%0,%4,4),%%xmm2 \n"
3375 "psubd 0x30(%0,%4,4),%%xmm3 \n"
3376 "lea 0x40(%0),%0 \n"
3377 "psubd (%1),%%xmm0 \n"
3378 "psubd 0x10(%1),%%xmm1 \n"
3379 "psubd 0x20(%1),%%xmm2 \n"
3380 "psubd 0x30(%1),%%xmm3 \n"
3381 "paddd (%1,%4,4),%%xmm0 \n"
3382 "paddd 0x10(%1,%4,4),%%xmm1 \n"
3383 "paddd 0x20(%1,%4,4),%%xmm2 \n"
3384 "paddd 0x30(%1,%4,4),%%xmm3 \n"
3385 "lea 0x40(%1),%1 \n"
3386 "cvtdq2ps %%xmm0,%%xmm0 \n"
3387 "cvtdq2ps %%xmm1,%%xmm1 \n"
3388 "mulps %%xmm4,%%xmm0 \n"
3389 "mulps %%xmm4,%%xmm1 \n"
3390 "cvtdq2ps %%xmm2,%%xmm2 \n"
3391 "cvtdq2ps %%xmm3,%%xmm3 \n"
3392 "mulps %%xmm4,%%xmm2 \n"
3393 "mulps %%xmm4,%%xmm3 \n"
3394 "cvtps2dq %%xmm0,%%xmm0 \n"
3395 "cvtps2dq %%xmm1,%%xmm1 \n"
3396 "cvtps2dq %%xmm2,%%xmm2 \n"
3397 "cvtps2dq %%xmm3,%%xmm3 \n"
3398 "packssdw %%xmm1,%%xmm0 \n"
3399 "packssdw %%xmm3,%%xmm2 \n"
3400 "packuswb %%xmm2,%%xmm0 \n"
3401 "movdqu %%xmm0,(%2) \n"
3402 "lea 0x10(%2),%2 \n"
3403 "sub $0x4,%3 \n"
3404 "jge 40b \n"
3405
3406 "49: \n"
3407 "add $0x3,%3 \n"
3408 "jl 19f \n"
3409
3410   // 1 pixel loop.
3411 ".p2align 2 \n"
3412 "10: \n"
3413 "movdqa (%0),%%xmm0 \n"
3414 "psubd (%0,%4,4),%%xmm0 \n"
3415 "lea 0x10(%0),%0 \n"
3416 "psubd (%1),%%xmm0 \n"
3417 "paddd (%1,%4,4),%%xmm0 \n"
3418 "lea 0x10(%1),%1 \n"
3419 "cvtdq2ps %%xmm0,%%xmm0 \n"
3420 "mulps %%xmm4,%%xmm0 \n"
3421 "cvtps2dq %%xmm0,%%xmm0 \n"
3422 "packssdw %%xmm0,%%xmm0 \n"
3423 "packuswb %%xmm0,%%xmm0 \n"
3424 "movd %%xmm0,(%2) \n"
3425 "lea 0x4(%2),%2 \n"
3426 "sub $0x1,%3 \n"
3427 "jge 10b \n"
3428 "19: \n"
3429 : "+r"(topleft), // %0
3430 "+r"(botleft), // %1
3431 "+r"(dst), // %2
3432 "+rm"(count) // %3
3433 : "r"(static_cast<intptr_t>(width)), // %4
3434 "rm"(area) // %5
3435 : "memory", "cc"
3436 #if defined(__SSE2__)
3437 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
3438 #endif
3439 );
3440 }
3441 #endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2
3442 #ifdef HAS_ARGBSHADE_SSE2
3443 // Shade 4 pixels at a time by specified value.
3444 // Aligned to 16 bytes.
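// Scalar sketch of the shading (illustrative only): every channel, including
// alpha, is scaled by the matching byte of 'value' with the same 8.8
// fixed-point product the pmulhuw loop uses, roughly v * s / 255.
// Hypothetical name, not compiled in.
#if 0
static void ARGBShadeRowSketch_C(const uint8* src_argb, uint8* dst_argb,
                                 int width, uint32 value) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      uint32 v = src_argb[c];
      uint32 s = (value >> (c * 8)) & 0xff;  // Per-channel shade factor.
      dst_argb[c] =
          static_cast<uint8>((((v * 0x0101) * (s * 0x0101)) >> 16) >> 8);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif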
3445 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3446 uint32 value) {
3447 asm volatile (
3448 "movd %3,%%xmm2 \n"
3449 "sub %0,%1 \n"
3450 "punpcklbw %%xmm2,%%xmm2 \n"
3451 "punpcklqdq %%xmm2,%%xmm2 \n"
3452
3453 // 4 pixel loop.
3454 ".p2align 2 \n"
3455 "1: \n"
3456 "movdqa (%0),%%xmm0 \n"
3457 "movdqa %%xmm0,%%xmm1 \n"
3458 "punpcklbw %%xmm0,%%xmm0 \n"
3459 "punpckhbw %%xmm1,%%xmm1 \n"
3460 "pmulhuw %%xmm2,%%xmm0 \n"
3461 "pmulhuw %%xmm2,%%xmm1 \n"
3462 "psrlw $0x8,%%xmm0 \n"
3463 "psrlw $0x8,%%xmm1 \n"
3464 "packuswb %%xmm1,%%xmm0 \n"
3465 "sub $0x4,%2 \n"
3466 "movdqa %%xmm0,(%0,%1,1) \n"
3467 "lea 0x10(%0),%0 \n"
3468 "jg 1b \n"
3469 : "+r"(src_argb), // %0
3470 "+r"(dst_argb), // %1
3471 "+r"(width) // %2
3472 : "r"(value) // %3
3473 : "memory", "cc"
3474 #if defined(__SSE2__)
3475 , "xmm0", "xmm1", "xmm2"
3476 #endif
3477 );
3478 }
3479 #endif // HAS_ARGBSHADE_SSE2
3480
3481 #ifdef HAS_ARGBAFFINEROW_SSE2
3482 // TODO(fbarchard): Find 64 bit way to avoid masking.
3483 // TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
3484 // Copy ARGB pixels from source image with slope to a row of destination.
3485 // Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
3486 // an error if movq is used. movd %%xmm0,%1
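// Scalar sketch of the affine walk (illustrative only): uv_dudv holds the
// starting (u, v) source position followed by the per-pixel (du, dv) step,
// and each output pixel copies the 4 bytes at the truncated source
// coordinate. Hypothetical name; no clipping is shown. Not compiled in.
#if 0
static void ARGBAffineRowSketch_C(const uint8* src_argb, int src_argb_stride,
                                  uint8* dst_argb, const float* uv_dudv,
                                  int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  for (int x = 0; x < width; ++x) {
    int ix = static_cast<int>(u);
    int iy = static_cast<int>(v);
    const uint8* p = src_argb + iy * src_argb_stride + ix * 4;
    dst_argb[0] = p[0];
    dst_argb[1] = p[1];
    dst_argb[2] = p[2];
    dst_argb[3] = p[3];
    u += uv_dudv[2];
    v += uv_dudv[3];
    dst_argb += 4;
  }
}
#endif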
3487
3488 LIBYUV_API
3489 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
3490 uint8* dst_argb, const float* uv_dudv, int width) {
3491 intptr_t src_argb_stride_temp = src_argb_stride;
3492 intptr_t temp = 0;
3493 asm volatile (
3494 "movq (%3),%%xmm2 \n"
3495 "movq 0x8(%3),%%xmm7 \n"
3496 "shl $0x10,%1 \n"
3497 "add $0x4,%1 \n"
3498 "movd %1,%%xmm5 \n"
3499 "sub $0x4,%4 \n"
3500 "jl 49f \n"
3501
3502 "pshufd $0x44,%%xmm7,%%xmm7 \n"
3503 "pshufd $0x0,%%xmm5,%%xmm5 \n"
3504 "movdqa %%xmm2,%%xmm0 \n"
3505 "addps %%xmm7,%%xmm0 \n"
3506 "movlhps %%xmm0,%%xmm2 \n"
3507 "movdqa %%xmm7,%%xmm4 \n"
3508 "addps %%xmm4,%%xmm4 \n"
3509 "movdqa %%xmm2,%%xmm3 \n"
3510 "addps %%xmm4,%%xmm3 \n"
3511 "addps %%xmm4,%%xmm4 \n"
3512
3513   // 4 pixel loop.
3514 ".p2align 4 \n"
3515 "40: \n"
3516 "cvttps2dq %%xmm2,%%xmm0 \n"
3517 "cvttps2dq %%xmm3,%%xmm1 \n"
3518 "packssdw %%xmm1,%%xmm0 \n"
3519 "pmaddwd %%xmm5,%%xmm0 \n"
3520 #if defined(__x86_64__)
3521 "movd %%xmm0,%1 \n"
3522 "mov %1,%5 \n"
3523 "and $0x0fffffff,%1 \n"
3524 "shr $32,%5 \n"
3525 "pshufd $0xEE,%%xmm0,%%xmm0 \n"
3526 #else
3527 "movd %%xmm0,%1 \n"
3528 "pshufd $0x39,%%xmm0,%%xmm0 \n"
3529 "movd %%xmm0,%5 \n"
3530 "pshufd $0x39,%%xmm0,%%xmm0 \n"
3531 #endif
3532 "movd (%0,%1,1),%%xmm1 \n"
3533 "movd (%0,%5,1),%%xmm6 \n"
3534 "punpckldq %%xmm6,%%xmm1 \n"
3535 "addps %%xmm4,%%xmm2 \n"
3536 "movq %%xmm1,(%2) \n"
3537 #if defined(__x86_64__)
3538 "movd %%xmm0,%1 \n"
3539 "mov %1,%5 \n"
3540 "and $0x0fffffff,%1 \n"
3541 "shr $32,%5 \n"
3542 #else
3543 "movd %%xmm0,%1 \n"
3544 "pshufd $0x39,%%xmm0,%%xmm0 \n"
3545 "movd %%xmm0,%5 \n"
3546 #endif
3547 "movd (%0,%1,1),%%xmm0 \n"
3548 "movd (%0,%5,1),%%xmm6 \n"
3549 "punpckldq %%xmm6,%%xmm0 \n"
3550 "addps %%xmm4,%%xmm3 \n"
3551 "sub $0x4,%4 \n"
3552 "movq %%xmm0,0x08(%2) \n"
3553 "lea 0x10(%2),%2 \n"
3554 "jge 40b \n"
3555
3556 "49: \n"
3557 "add $0x3,%4 \n"
3558 "jl 19f \n"
3559
3560   // 1 pixel loop.
3561 ".p2align 4 \n"
3562 "10: \n"
3563 "cvttps2dq %%xmm2,%%xmm0 \n"
3564 "packssdw %%xmm0,%%xmm0 \n"
3565 "pmaddwd %%xmm5,%%xmm0 \n"
3566 "addps %%xmm7,%%xmm2 \n"
3567 "movd %%xmm0,%1 \n"
3568 #if defined(__x86_64__)
3569 "and $0x0fffffff,%1 \n"
3570 #endif
3571 "movd (%0,%1,1),%%xmm0 \n"
3572 "sub $0x1,%4 \n"
3573 "movd %%xmm0,(%2) \n"
3574 "lea 0x4(%2),%2 \n"
3575 "jge 10b \n"
3576 "19: \n"
3577 : "+r"(src_argb), // %0
3578 "+r"(src_argb_stride_temp), // %1
3579 "+r"(dst_argb), // %2
3580 "+r"(uv_dudv), // %3
3581 "+rm"(width), // %4
3582 "+r"(temp) // %5
3583 :
3584 : "memory", "cc"
3585 #if defined(__SSE2__)
3586 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3587 #endif
3588 );
3589 }
3590 #endif // HAS_ARGBAFFINEROW_SSE2
3591
3592 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
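// Scalar sketch of the filtering below (illustrative only): the fraction is
// halved to a 0..128 range, 0 copies the top row, 64 averages the two rows,
// and everything else blends with a >> 7 fixed point. Hypothetical name, not
// compiled in.
#if 0
static void ARGBInterpolateRowSketch_C(uint8* dst_ptr, const uint8* src_ptr,
                                       ptrdiff_t src_stride, int dst_width,
                                       int source_y_fraction) {
  int f = source_y_fraction >> 1;            // 0..128, as in the asm prologue.
  const uint8* src1 = src_ptr + src_stride;  // Row below.
  for (int x = 0; x < dst_width * 4; ++x) {  // 4 bytes per ARGB pixel.
    dst_ptr[x] =
        static_cast<uint8>((src_ptr[x] * (128 - f) + src1[x] * f) >> 7);
  }
}
#endif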
3593 void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
3594 ptrdiff_t src_stride, int dst_width,
3595 int source_y_fraction) {
3596 asm volatile (
3597 "sub %1,%0 \n"
3598 "shr %3 \n"
3599 "cmp $0x0,%3 \n"
3600 "je 2f \n"
3601 "cmp $0x40,%3 \n"
3602 "je 3f \n"
3603 "movd %3,%%xmm0 \n"
3604 "neg %3 \n"
3605 "add $0x80,%3 \n"
3606 "movd %3,%%xmm5 \n"
3607 "punpcklbw %%xmm0,%%xmm5 \n"
3608 "punpcklwd %%xmm5,%%xmm5 \n"
3609 "pshufd $0x0,%%xmm5,%%xmm5 \n"
3610 ".p2align 4 \n"
3611 "1: \n"
3612 "movdqa (%1),%%xmm0 \n"
3613 "movdqa (%1,%4,1),%%xmm2 \n"
3614 "movdqa %%xmm0,%%xmm1 \n"
3615 "punpcklbw %%xmm2,%%xmm0 \n"
3616 "punpckhbw %%xmm2,%%xmm1 \n"
3617 "pmaddubsw %%xmm5,%%xmm0 \n"
3618 "pmaddubsw %%xmm5,%%xmm1 \n"
3619 "psrlw $0x7,%%xmm0 \n"
3620 "psrlw $0x7,%%xmm1 \n"
3621 "packuswb %%xmm1,%%xmm0 \n"
3622 "sub $0x4,%2 \n"
3623 "movdqa %%xmm0,(%1,%0,1) \n"
3624 "lea 0x10(%1),%1 \n"
3625 "jg 1b \n"
3626 "jmp 4f \n"
3627 ".p2align 4 \n"
3628 "2: \n"
3629 "movdqa (%1),%%xmm0 \n"
3630 "sub $0x4,%2 \n"
3631 "movdqa %%xmm0,(%1,%0,1) \n"
3632 "lea 0x10(%1),%1 \n"
3633 "jg 2b \n"
3634 "jmp 4f \n"
3635 ".p2align 4 \n"
3636 "3: \n"
3637 "movdqa (%1),%%xmm0 \n"
3638 "pavgb (%1,%4,1),%%xmm0 \n"
3639 "sub $0x4,%2 \n"
3640 "movdqa %%xmm0,(%1,%0,1) \n"
3641 "lea 0x10(%1),%1 \n"
3642 "jg 3b \n"
3643 "4: \n"
3644 ".p2align 4 \n"
3645 : "+r"(dst_ptr), // %0
3646 "+r"(src_ptr), // %1
3647 "+r"(dst_width), // %2
3648 "+r"(source_y_fraction) // %3
3649 : "r"(static_cast<intptr_t>(src_stride)) // %4
3650 : "memory", "cc"
3651 #if defined(__SSE2__)
3652 , "xmm0", "xmm1", "xmm2", "xmm5"
3653 #endif
3654 );
3655 }
3656
3657 #endif // defined(__x86_64__) || defined(__i386__)
3658
3659 #ifdef __cplusplus
3660 } // extern "C"
3661 } // namespace libyuv
3662 #endif
3663