/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the >>2 in the 3/4 box filters.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

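// For reference, a scalar sketch of the point-sampling step above (not part
// of the original file): psrlw $0x8 on packed words keeps the odd source
// bytes, so each output pixel is the second pixel of a horizontal pair.
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[x * 2 + 1];
//   }
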
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "pxor       %%xmm5,%%xmm5                  \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pavgw      %%xmm5,%%xmm0                  \n"
    "pavgw      %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "pxor       %%xmm5,%%xmm5                  \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    "psrlw      $0x1,%%xmm0                    \n"
    "psrlw      $0x1,%%xmm1                    \n"
    "pavgw      %%xmm5,%%xmm0                  \n"
    "pavgw      %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

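// For reference, a scalar sketch of the 2x2 box average above (not part of
// the original file). pmaddubsw sums horizontal pairs, paddw adds the two
// rows, and the >>1 followed by pavgw-with-zero rounds the final divide,
// which works out to the same result as:
//   const uint8* s = src_ptr;
//   const uint8* t = src_ptr + src_stride;
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
//     s += 2;
//     t += 2;
//   }
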
#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"
    "pslld     $0x10,%%xmm5                    \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

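// For reference, a scalar sketch of the 1/4 point sampling above (not part
// of the original file): the mask 0x00ff0000 built in xmm5 keeps byte 2 of
// every dword, so each output pixel is the third pixel of each group of 4.
//   for (int x = 0; x < dst_width; ++x) {
//     dst_ptr[x] = src_ptr[x * 4 + 2];
//   }
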
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  intptr_t stridex3;
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "movdqa     %%xmm4,%%xmm5                  \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "psllw      $0x3,%%xmm5                    \n"
    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    "phaddw     %%xmm1,%%xmm0                  \n"
    "paddw      %%xmm5,%%xmm0                  \n"
    "psrlw      $0x4,%%xmm0                    \n"
    "packuswb   %%xmm0,%%xmm0                  \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width),   // %2
    "=&r"(stridex3)    // %3
  : "r"((intptr_t)(src_stride))    // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

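// For reference, a scalar sketch of the 4x4 box average above (not part of
// the original file): four rows of pair sums are accumulated, phaddw folds
// the pairs into sums of 4, and xmm5 holds the rounding constant of 8.
//   const uint8* s = src_ptr;
//   for (int x = 0; x < dst_width; ++x) {
//     int sum = 0;
//     for (int r = 0; r < 4; ++r) {
//       const uint8* t = s + r * src_stride;
//       sum += t[0] + t[1] + t[2] + t[3];
//     }
//     dst_ptr[x] = (sum + 8) >> 4;
//     s += 4;
//   }
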
#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpsrld     $0x18,%%ymm5,%%ymm5            \n"
    "vpslld     $0x10,%%ymm5,%%ymm5            \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpsllw     $0x3,%%ymm4,%%ymm5             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)          //  vmovdqu  (%0,%3,2),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)          //  vmovdqu  0x20(%0,%3,2),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)          //  vmovdqu  (%0,%4,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)          //  vmovdqu  0x20(%0,%4,1),%%ymm3
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "r"((intptr_t)(src_stride * 3))   // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm3                       \n"
    "movdqa    %1,%%xmm4                       \n"
    "movdqa    %2,%%xmm5                       \n"
  :
  : "m"(kShuf0),  // %0
    "m"(kShuf1),  // %1
    "m"(kShuf2)   // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "palignr   $0x8,%%xmm0,%%xmm1              \n"
    "pshufb    %%xmm3,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

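// For reference, a scalar sketch of the 3/4 point sampling above (not part
// of the original file): the kShuf tables keep source bytes 0, 1 and 3 of
// every group of 4.
//   for (int x = 0; x < dst_width; x += 3) {
//     dst_ptr[x + 0] = src_ptr[0];
//     dst_ptr[x + 1] = src_ptr[1];
//     dst_ptr[x + 2] = src_ptr[3];
//     src_ptr += 4;
//   }
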
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"  // kShuf01
    "movdqa    %1,%%xmm3                       \n"  // kShuf11
    "movdqa    %2,%%xmm4                       \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa    %0,%%xmm5                       \n"  // kMadd01
    "movdqa    %1,%%xmm0                       \n"  // kMadd11
    "movdqa    %2,%%xmm1                       \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );

  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
    : "+r"(src_ptr),   // %0
      "+r"(dst_ptr),   // %1
      "+r"(dst_width)  // %2
    : "r"((intptr_t)(src_stride)),  // %3
      "m"(kMadd21)     // %4
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

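// Note on the two 3/4 box variants above (comment added for reference): the
// _1_Box version blends the two source rows equally with one pavgb, while
// the _0_Box version applies pavgb twice, giving the near row a 3:1 weight
// (roughly (3 * a + b + 2) / 4 per byte, up to pavgb's rounding). Horizontal
// weighting is then done by pmaddubsw with the kMadd tables and rounded via
// kRound34 before the >>2.
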
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "lea       " MEMLEA(0xc,1) ",%1            \n"
    "sub       $0xc,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

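// For reference, a scalar sketch of the 3/8 point sampling above (not part
// of the original file): kShuf38a/kShuf38b pick source bytes 0, 3 and 6 of
// every group of 8, giving 6 output pixels per 16 input pixels.
//   for (int x = 0; x < dst_width; x += 3) {
//     dst_ptr[x + 0] = src_ptr[0];
//     dst_ptr[x + 1] = src_ptr[3];
//     dst_ptr[x + 2] = src_ptr[6];
//     src_ptr += 8;
//   }
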
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm4                       \n"
    "movdqa    %3,%%xmm5                       \n"
  :
  : "m"(kShufAb0),   // %0
    "m"(kShufAb1),   // %1
    "m"(kShufAb2),   // %2
    "m"(kScaleAb2)   // %3
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pshufb    %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "paddusw   %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "paddusw   %%xmm0,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm1                    \n"
    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "sub       $0x6,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width)    // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa    %0,%%xmm2                       \n"
    "movdqa    %1,%%xmm3                       \n"
    "movdqa    %2,%%xmm4                       \n"
    "pxor      %%xmm5,%%xmm5                   \n"
  :
  : "m"(kShufAc),    // %0
    "m"(kShufAc3),   // %1
    "m"(kScaleAc33)  // %2
  );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "pshufb    %%xmm3,%%xmm7                   \n"
    "paddusw   %%xmm7,%%xmm6                   \n"
    "pmulhuw   %%xmm4,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movd      %%xmm6," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm6                    \n"
    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "sub       $0x6,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpckhbw %%xmm5,%%xmm3                   \n"
    "paddusw   %%xmm2,%%xmm0                   \n"
    "paddusw   %%xmm3,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(src_width)    // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}

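// For reference, a scalar sketch of the row accumulation above (not part of
// the original file): bytes are widened to 16 bits and added to dst with
// unsigned saturation (paddusw).
//   for (int x = 0; x < src_width; ++x) {
//     uint32 sum = dst_ptr[x] + src_ptr[x];
//     dst_ptr[x] = (uint16)(sum > 65535u ? 65535u : sum);
//   }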

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm3        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 32
    "vpermq     $0xd8,%%ymm3,%%ymm3            \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
    "vpaddusw   " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
    "vpaddusw   " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(src_width)    // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };

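// Worked example of the signed-madd trick (comment added for reference).
// With weights f and 128 - f, pmaddubsw on biased pixels computes
//   (p0 - 128) * (128 - f) + (p1 - 128) * f = p0 * (128 - f) + p1 * f - 16384
// so adding kFadd40 = 0x4040 = 16384 + 64 both restores the bias and adds
// the rounding half before the final >>7.
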
// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  intptr_t x0, x1, temp_pixel;
  asm volatile (
    "movd      %6,%%xmm2                       \n"
    "movd      %7,%%xmm3                       \n"
    "movl      $0x04040000,%k2                 \n"
    "movd      %k2,%%xmm5                      \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"  // 0x007f007f
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $15,%%xmm7                      \n"  // 0x00010001

    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "subl      $0x2,%5                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm1                     \n"
    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
    "movd      %k2,%%xmm4                      \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpcklwd %%xmm4,%%xmm0                   \n"
    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
    "pxor      %%xmm6,%%xmm1                   \n"  // 128 - f = (f ^ 127) + 1
    "paddusb   %%xmm7,%%xmm1                   \n"
    "pmaddubsw %%xmm0,%%xmm1                   \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "paddw     %9,%%xmm1                       \n"  // make pixels unsigned.
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movd      %%xmm1,%k2                      \n"
    "mov       %w2," MEMACCESS(0) "            \n"
    "lea       " MEMLEA(0x2,0) ",%0            \n"
    "subl      $0x2,%5                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "addl      $0x1,%5                         \n"
    "jl        99f                             \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm2                     \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
    "pxor      %%xmm6,%%xmm2                   \n"
    "paddusb   %%xmm7,%%xmm2                   \n"
    "pmaddubsw %%xmm0,%%xmm2                   \n"
    "paddw     %9,%%xmm2                       \n"  // make pixels unsigned.
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    "movd      %%xmm2,%k2                      \n"
    "mov       %b2," MEMACCESS(0) "            \n"
  "99:                                         \n"
  : "+r"(dst_ptr),      // %0
    "+r"(src_ptr),      // %1
    "=&a"(temp_pixel),  // %2
    "=&r"(x0),          // %3
    "=&r"(x1),          // %4
#if defined(__x86_64__)
    "+rm"(dst_width)    // %5
#else
    "+m"(dst_width)     // %5
#endif
  : "rm"(x),            // %6
    "rm"(dx),           // %7
#if defined(__x86_64__)
    "x"(kFsub80),       // %8
    "x"(kFadd40)        // %9
#else
    "m"(kFsub80),       // %8
    "m"(kFadd40)        // %9
#endif
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

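// For reference, a scalar sketch of the bilinear column filter above (not
// part of the original file; it follows the asm's 7-bit fraction):
//   for (int j = 0; j < dst_width; ++j) {
//     int xi = x >> 16;                  // integer source position
//     int f = (x >> 9) & 0x7f;           // top 7 bits of the fraction
//     dst_ptr[j] = (uint8)((src_ptr[xi] * (128 - f) +
//                           src_ptr[xi + 1] * f + 64) >> 7);
//     x += dx;
//   }
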
// Reads 16 pixels, duplicates them and writes 32 pixels.
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"

  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width)    // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}

void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2"
  );
}

void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu   (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu   0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

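// Note on the ARGB 2x2 box above (comment added for reference): it averages
// vertically with pavgb, then averages the even/odd pixel pairs selected by
// shufps. Because each pavgb rounds up on ties, the result can differ by 1
// from an exact (a + b + c + d + 2) >> 2 box average.
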
// Reads 4 pixels at a time.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    LABELALIGN
  "1:                                          \n"
    "movd      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "punpckldq %%xmm3,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+r"(dst_width),      // %3
    "=&r"(src_stepx_x12)  // %4
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

// Blends four 2x2 to 4x1.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride, int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"

    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    "movq      " MEMACCESS(5) ",%%xmm2         \n"
    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),        // %0
    "+r"(src_stepx_x4),    // %1
    "+r"(dst_argb),        // %2
    "+rm"(dst_width),      // %3
    "=&r"(src_stepx_x12),  // %4
    "+r"(row1)             // %5
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}

void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  intptr_t x0, x1;
  asm volatile (
    "movd      %5,%%xmm2                       \n"
    "movd      %6,%%xmm3                       \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "pshufd    $0x11,%%xmm3,%%xmm0             \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pshufd    $0x5,%%xmm3,%%xmm0              \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "cmp       $0x0,%4                         \n"
    "jl        99f                             \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"

    LABELALIGN
  "40:                                         \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "pextrw    $0x7,%%xmm2,%k1                 \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
    "pextrw    $0x1,%%xmm2,%k0                 \n"
    "pextrw    $0x3,%%xmm2,%k1                 \n"
    "punpckldq %%xmm4,%%xmm1                   \n"
    "punpcklqdq %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%4                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "test      $0x2,%4                         \n"
    "je        29f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    "pextrw    $0x5,%%xmm2,%k0                 \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x8,2) ",%2            \n"
  "29:                                         \n"
    "test      $0x1,%4                         \n"
    "je        99f                             \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    "movd      %%xmm0," MEMACCESS(2) "         \n"
  "99:                                         \n"
  : "=&a"(x0),         // %0
    "=&d"(x1),         // %1
    "+r"(dst_argb),    // %2
    "+r"(src_argb),    // %3
    "+r"(dst_width)    // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}

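// For reference, a scalar sketch of the nearest-neighbor ARGB column step
// above (not part of the original file; x and dx are 16.16 fixed point):
//   const uint32* src = (const uint32*)(src_argb);
//   uint32* dst = (uint32*)(dst_argb);
//   for (int j = 0; j < dst_width; ++j) {
//     dst[j] = src[x >> 16];
//     x += dx;
//   }
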
// Reads 4 pixels, duplicates them and writes 8 pixels.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpckldq %%xmm0,%%xmm0                   \n"
    "punpckhdq %%xmm1,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"

  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+r"(dst_width)    // %2
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  intptr_t x0, x1;
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"
    "movdqa    %1,%%xmm5                       \n"
  :
  : "m"(kShuffleColARGB),   // %0
    "m"(kShuffleFractions)  // %1
  );

  asm volatile (
    "movd      %5,%%xmm2                       \n"
    "movd      %6,%%xmm3                       \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "sub       $0x2,%2                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"

    LABELALIGN
  "2:                                          \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    "psrlw     $0x9,%%xmm1                     \n"
    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
    "pshufb    %%xmm5,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm1                   \n"
    "pmaddubsw %%xmm1,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x2,%2                         \n"
    "jge       2b                              \n"

    LABELALIGN
  "29:                                         \n"
    "add       $0x1,%2                         \n"
    "jl        99f                             \n"
    "psrlw     $0x9,%%xmm2                     \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    "pshufb    %%xmm5,%%xmm2                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pxor      %%xmm6,%%xmm2                   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(0) "         \n"

    LABELALIGN
  "99:                                         \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "=&r"(x0),         // %3
    "=&r"(x1)          // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile (
    "cdq                                       \n"
    "shld      $0x10,%%eax,%%edx               \n"
    "shl       $0x10,%%eax                     \n"
    "idiv      %1                              \n"
    "mov       %0, %%eax                       \n"
    : "+a"(num)  // %0
    : "c"(div)   // %1
    : "memory", "cc", "edx"
  );
  return num;
}

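// Equivalent C expression for FixedDiv_X86 (comment added for reference;
// a 64-bit intermediate avoids overflow of num << 16):
//   return (int)(((int64)(num) << 16) / div);
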
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile (
    "cdq                                       \n"
    "shld      $0x10,%%eax,%%edx               \n"
    "shl       $0x10,%%eax                     \n"
    "sub       $0x10001,%%eax                  \n"
    "sbb       $0x0,%%edx                      \n"
    "sub       $0x1,%1                         \n"
    "idiv      %1                              \n"
    "mov       %0, %%eax                       \n"
    : "+a"(num)  // %0
    : "c"(div)   // %1
    : "memory", "cc", "edx"
  );
  return num;
}

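// Equivalent C expression for FixedDiv1_X86 (comment added for reference):
//   return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
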
#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif