/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the >>2 in the 3/4 box filters.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
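
// The kScaleAc33 / kScaleAb2 entries turn division by 9, 6, 3 and 2 into a
// fixed-point multiply: pmulhuw keeps the high 16 bits of a 16x16 product, so
// multiplying a box sum by 65536/n approximates sum/n.  A minimal sketch of
// that identity (illustrative only; not part of the library build):
#if 0
static uint16 DivideBy9Approx(uint16 sum) {
  // Same reciprocal as kScaleAc33[0] (65536 / 9 == 7281).  Like pmulhuw this
  // truncates, so the result can be one lower than exact division.
  return (uint16)(((uint32)(sum) * (65536 / 9)) >> 16);
}
#endif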

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrlw     $0xf,%%xmm4                     \n"
    "packuswb  %%xmm4,%%xmm4                   \n"
    "pxor      %%xmm5,%%xmm5                   \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pavgw     %%xmm5,%%xmm0                   \n"
    "pavgw     %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrlw     $0xf,%%xmm4                     \n"
    "packuswb  %%xmm4,%%xmm4                   \n"
    "pxor      %%xmm5,%%xmm5                   \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    "psrlw     $0x1,%%xmm0                     \n"
    "psrlw     $0x1,%%xmm1                     \n"
    "pavgw     %%xmm5,%%xmm0                   \n"
    "pavgw     %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
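
// The kernel above reduces each 2x2 block to a rounded average: pmaddubsw
// with weights of 1 makes horizontal pair sums, paddw adds the two rows,
// then psrlw $1 followed by pavgw against zero rounds the /4.  A scalar
// sketch of the same result (illustrative only; not part of the build):
#if 0
static void ScaleRowDown2Box_Sketch(const uint8* row0, const uint8* row1,
                                    uint8* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    // ((a + b + c + d) >> 1 + 1) >> 1 == (a + b + c + d + 2) >> 2.
    dst[x] = (uint8)((row0[2 * x] + row0[2 * x + 1] +
                      row1[2 * x] + row1[2 * x + 1] + 2) >> 2);
  }
}
#endif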

#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"
    "pslld     $0x10,%%xmm5                    \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  intptr_t stridex3;
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrlw     $0xf,%%xmm4                     \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "packuswb  %%xmm4,%%xmm4                   \n"
    "psllw     $0x3,%%xmm5                     \n"
    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"
    "psrlw     $0x4,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "=&r"(stridex3)   // %3
  : "r"((intptr_t)(src_stride))   // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
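
// The 4x4 box above accumulates pair sums from four rows, folds adjacent
// pairs with phaddw, then adds 8 (xmm5) and shifts right by 4: a rounded
// average of 16 pixels.  Scalar sketch (illustrative only; not part of the
// build):
#if 0
static void ScaleRowDown4Box_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst_ptr, int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j) {
        sum += src_ptr[i * src_stride + 4 * x + j];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);  // +8 then >>4, as in the asm.
  }
}
#endif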

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpsrld     $0x18,%%ymm5,%%ymm5            \n"
    "vpslld     $0x10,%%ymm5,%%ymm5            \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpsllw     $0x3,%%ymm4,%%ymm5             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)          //  vmovdqu  (%0,%3,2),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)          //  vmovdqu  0x20(%0,%3,2),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)          //  vmovdqu  (%0,%4,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)          //  vmovdqu  0x20(%0,%4,1),%%ymm3
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride)),      // %3
    "r"((intptr_t)(src_stride * 3))   // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm3                       \n"
    "movdqa    %1,%%xmm4                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kShuf0),  // %0
    "m"(kShuf1),  // %1
    "m"(kShuf2)   // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "palignr   $0x8,%%xmm0,%%xmm1              \n"
    "pshufb    %%xmm3,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)                  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );

  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)                  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
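
// Both 3/4 box kernels above reduce every 4 source pixels to 3 using the
// kShuf*/kMadd* tables: the pairs are weighted 3:1, 2:2 and 1:3, then
// kRound34 (+2) and psrlw $2 round the result.  A scalar sketch of that
// horizontal step (illustrative only; the vertical pavgb blend is omitted):
#if 0
static void ScaleRowDown34_Horizontal_Sketch(const uint8* src_ptr,
                                             uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    const uint8* s = src_ptr + (x / 3) * 4;  // 4 source pixels -> 3 outputs
    dst_ptr[x + 0] = (uint8)((s[0] * 3 + s[1] * 1 + 2) >> 2);
    dst_ptr[x + 1] = (uint8)((s[1] * 2 + s[2] * 2 + 2) >> 2);
    dst_ptr[x + 2] = (uint8)((s[2] * 1 + s[3] * 3 + 2) >> 2);
  }
}
#endif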

void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "lea       " MEMLEA(0xc,1) ",%1            \n"
    "sub       $0xc,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "m"(kShuf38a),    // %3
    "m"(kShuf38b)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm4                       \n"
    "movdqa    %3,%%xmm5                       \n"
  :
  : "m"(kShufAb0),  // %0
    "m"(kShufAb1),  // %1
    "m"(kShufAb2),  // %2
    "m"(kScaleAb2)  // %3
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pshufb    %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "paddusw   %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "paddusw   %%xmm0,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm1                    \n"
    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "sub       $0x6,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm4                       \n"
    "pxor      %%xmm5,%%xmm5                   \n"
  :
  : "m"(kShufAc),    // %0
    "m"(kShufAc3),   // %1
    "m"(kScaleAc33)  // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "pshufb    %%xmm3,%%xmm7                   \n"
    "paddusw   %%xmm7,%%xmm6                   \n"
    "pmulhuw   %%xmm4,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movd      %%xmm6," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm6                    \n"
    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "sub       $0x6,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
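
// The 3/8 box path above sums 3 rows into 16 bit words, folds neighbouring
// columns with psrldq/paddusw, and divides by the box size with pmulhuw by
// kScaleAc33 (65536/9 for the 3-wide columns, 65536/6 for the 2-wide one).
// Scalar sketch (illustrative only; not part of the build):
#if 0
static void ScaleRowDown38_3_Box_Sketch(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  int x, i;
  for (x = 0; x < dst_width; x += 3) {
    const uint8* s = src_ptr + (x / 3) * 8;  // 8 source pixels -> 3 outputs
    uint32 sum0 = 0, sum1 = 0, sum2 = 0;
    for (i = 0; i < 3; ++i) {
      const uint8* r = s + i * src_stride;
      sum0 += r[0] + r[1] + r[2];
      sum1 += r[3] + r[4] + r[5];
      sum2 += r[6] + r[7];  // last group is only 2 columns wide
    }
    dst_ptr[x + 0] = (uint8)((sum0 * (65536 / 9)) >> 16);
    dst_ptr[x + 1] = (uint8)((sum1 * (65536 / 9)) >> 16);
    dst_ptr[x + 2] = (uint8)((sum2 * (65536 / 6)) >> 16);  // 2x3 box
  }
}
#endif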

// Reads 16 source bytes at a time and accumulates them into 16 uint16 sums.
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpckhbw %%xmm5,%%xmm3                   \n"
    "paddusw   %%xmm2,%%xmm0                   \n"
    "paddusw   %%xmm3,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(src_width)    // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
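
// ScaleAddRow_SSE2 widens 16 source bytes to words and accumulates them into
// the destination row with saturation (paddusw).  Scalar sketch (illustrative
// only; not part of the build):
#if 0
static void ScaleAddRow_Sketch(const uint8* src_ptr, uint16* dst_ptr,
                               int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    uint32 sum = (uint32)(dst_ptr[x]) + src_ptr[x];
    dst_ptr[x] = (uint16)(sum > 65535 ? 65535 : sum);  // paddusw saturates
  }
}
#endif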

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm3        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 32
    "vpermq     $0xd8,%%ymm3,%%ymm3            \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
    "vpaddusw   " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
    "vpaddusw   " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
    "lea        " MEMLEA(0x40,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(src_width)    // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };

// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  intptr_t x0, x1, temp_pixel;
  asm volatile (
    "movd      %6,%%xmm2                       \n"
    "movd      %7,%%xmm3                       \n"
    "movl      $0x04040000,%k2                 \n"
    "movd      %k2,%%xmm5                      \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"  // 0x007f007f
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $15,%%xmm7                      \n"  // 0x00010001

    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "subl      $0x2,%5                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm1                     \n"
    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
    "movd      %k2,%%xmm4                      \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpcklwd %%xmm4,%%xmm0                   \n"
    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
    "pxor      %%xmm6,%%xmm1                   \n"  // 128 - f = (f ^ 127) + 1
    "paddusb   %%xmm7,%%xmm1                   \n"
    "pmaddubsw %%xmm0,%%xmm1                   \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "paddw     %9,%%xmm1                       \n"  // make pixels unsigned.
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movd      %%xmm1,%k2                      \n"
    "mov       %w2," MEMACCESS(0) "            \n"
    "lea       " MEMLEA(0x2,0) ",%0            \n"
    "subl      $0x2,%5                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "addl      $0x1,%5                         \n"
    "jl        99f                             \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm2                     \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
    "pxor      %%xmm6,%%xmm2                   \n"
    "paddusb   %%xmm7,%%xmm2                   \n"
    "pmaddubsw %%xmm0,%%xmm2                   \n"
    "paddw     %9,%%xmm2                       \n"  // make pixels unsigned.
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    "movd      %%xmm2,%k2                      \n"
    "mov       %b2," MEMACCESS(0) "            \n"
  "99:                                         \n"
  : "+r"(dst_ptr),      // %0
    "+r"(src_ptr),      // %1
    "=&a"(temp_pixel),  // %2
    "=&r"(x0),          // %3
    "=&r"(x1),          // %4
#if defined(__x86_64__)
    "+rm"(dst_width)    // %5
#else
    "+m"(dst_width)     // %5
#endif
  : "rm"(x),   // %6
    "rm"(dx),  // %7
#if defined(__x86_64__)
    "x"(kFsub80),  // %8
    "x"(kFadd40)   // %9
#else
    "m"(kFsub80),  // %8
    "m"(kFadd40)   // %9
#endif
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
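
// ScaleFilterCols_SSSE3 steps x in 16.16 fixed point and blends each pair of
// neighbouring source pixels with weights (128 - f) and f, where f is the top
// 7 bits of the fraction.  The kFsub80 bias keeps pmaddubsw out of saturation
// and kFadd40 undoes it while adding the +64 rounding before the >>7.  Scalar
// sketch without the bias trick (illustrative only; not part of the build):
#if 0
static void ScaleFilterCols_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                   int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;
    int f = (x >> 9) & 0x7f;  // 7 bit fraction, as psrlw $0x9 extracts above
    dst_ptr[j] =
        (uint8)((src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7);
    x += dx;
  }
}
#endif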

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"

  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2"
  );
}

void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    LABELALIGN
  "1:                                          \n"
    "movd      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd  (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd  (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd  (%0,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "punpckldq %%xmm3,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+r"(dst_width),      // %3
    "=&r"(src_stepx_x12)  // %4
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
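
// ScaleARGBRowDownEven_SSE2 gathers every src_stepx-th ARGB pixel, four at a
// time, into a contiguous destination.  Scalar sketch (illustrative only; not
// part of the build; assumes 4-byte aligned ARGB pixels):
#if 0
static void ScaleARGBRowDownEven_Sketch(const uint8* src_argb, int src_stepx,
                                        uint8* dst_argb, int dst_width) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];  // copy one whole ARGB pixel
  }
}
#endif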

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride, int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"

    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps  (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq    (%0,%1,2),%%xmm1
    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps  (%0,%4,1),%%xmm1
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "movq      " MEMACCESS(5) ",%%xmm2         \n"
    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps  (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq    (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps  (%5,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),        // %0
    "+r"(src_stepx_x4),    // %1
    "+r"(dst_argb),        // %2
    "+rm"(dst_width),      // %3
    "=&r"(src_stepx_x12),  // %4
    "+r"(row1)             // %5
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  intptr_t x0, x1;
  asm volatile (
    "movd      %5,%%xmm2                       \n"
    "movd      %6,%%xmm3                       \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "pshufd    $0x11,%%xmm3,%%xmm0             \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pshufd    $0x5,%%xmm3,%%xmm0              \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "cmp       $0x0,%4                         \n"
    "jl        99f                             \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    LABELALIGN
  "40:                                         \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd  (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd  (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "pextrw    $0x7,%%xmm2,%k1                 \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd  (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd  (%3,%1,4),%%xmm4
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "punpckldq %%xmm4,%%xmm1                   \n"
    "punpcklqdq %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%4                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "test      $0x2,%4                         \n"
    "je        29f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd  (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd  (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x8,2) ",%2            \n"
  "29:                                         \n"
    "test      $0x1,%4                         \n"
    "je        99f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd  (%3,%0,4),%%xmm0
    "movd      %%xmm0," MEMACCESS(2) "         \n"
  "99:                                         \n"
  : "=&a"(x0),       // %0
    "=&d"(x1),       // %1
    "+r"(dst_argb),  // %2
    "+r"(src_argb),  // %3
    "+r"(dst_width)  // %4
  : "rm"(x),   // %5
    "rm"(dx)   // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpckldq %%xmm0,%%xmm0                   \n"
    "punpckhdq %%xmm1,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"

  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  intptr_t x0, x1;
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm5                       \n"
  :
  : "m"(kShuffleColARGB),   // %0
    "m"(kShuffleFractions)  // %1
  );

  asm volatile (
    "movd      %5,%%xmm2                       \n"
    "movd      %6,%%xmm3                       \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "sub       $0x2,%2                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq  (%1,%3,4),%%xmm0
    "psrlw     $0x9,%%xmm1                     \n"
    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps  (%1,%4,4),%%xmm0
    "pshufb    %%xmm5,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm1                   \n"
    "pmaddubsw %%xmm1,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x2,%2                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "add       $0x1,%2                         \n"
    "jl        99f                             \n"
    "psrlw     $0x9,%%xmm2                     \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq  (%1,%3,4),%%xmm0
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(0) "         \n"

    LABELALIGN
  "99:                                         \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "=&r"(x0),         // %3
    "=&r"(x1)          // %4
  : "rm"(x),   // %5
    "rm"(dx)   // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
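
// ScaleARGBFilterCols_SSSE3 uses the same 16.16 stepping per ARGB pixel:
// kShuffleColARGB pairs the two source pixels channel by channel and
// kShuffleFractions broadcasts the 7 bit fraction, so each channel is blended
// as (a * (127 - f) + b * f) >> 7 (no rounding term in this variant).
// Scalar sketch (illustrative only; not part of the build):
#if 0
static void ScaleARGBFilterCols_Sketch(uint8* dst_argb, const uint8* src_argb,
                                       int dst_width, int x, int dx) {
  int j, c;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;
    int f = (x >> 9) & 0x7f;
    const uint8* a = src_argb + xi * 4;
    const uint8* b = a + 4;
    for (c = 0; c < 4; ++c) {  // B, G, R, A
      dst_argb[j * 4 + c] = (uint8)((a[c] * (127 - f) + b[c] * f) >> 7);
    }
    x += dx;
  }
}
#endif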

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile (
    "cdq                                       \n"
    "shld      $0x10,%%eax,%%edx               \n"
    "shl       $0x10,%%eax                     \n"
    "idiv      %1                              \n"
    "mov       %0, %%eax                       \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx"
  );
  return num;
}
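
// A portable C sketch of the same 16.16 result (illustrative only; not part
// of the build).  FixedDiv1_X86 below similarly computes
// ((num << 16) - 0x10001) / (div - 1).
#if 0
static int FixedDiv_Sketch(int num, int div) {
  return (int)((((int64)(num)) << 16) / div);  // 16.16 fixed point quotient
}
#endif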

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile (
    "cdq                                       \n"
    "shld      $0x10,%%eax,%%edx               \n"
    "shl       $0x10,%%eax                     \n"
    "sub       $0x10001,%%eax                  \n"
    "sbb       $0x0,%%edx                      \n"
    "sub       $0x1,%1                         \n"
    "idiv      %1                              \n"
    "mov       %0, %%eax                       \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx"
  );
  return num;
}

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif