1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/scale.h"
12 
13 #include <assert.h>
14 #include <string.h>
15 #include <stdlib.h>  // For getenv()
16 
17 #include "libyuv/cpu_id.h"
18 #include "libyuv/planar_functions.h"  // For CopyARGB
19 #include "libyuv/row.h"
20 
21 #ifdef __cplusplus
22 namespace libyuv {
23 extern "C" {
24 #endif
25 
26 // Bilinear SSE2 is disabled.
27 #define SSE2_DISABLED 1
28 
29 // ARGB scaling uses bilinear or point, but not box filter.
30 /**
31  * SSE2 downscalers with bilinear interpolation.
32  */
33 
34 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
35 
36 #define HAS_SCALEARGBROWDOWN2_SSE2
37 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
38 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
39 __declspec(naked) __declspec(align(16))
ScaleARGBRowDown2_SSE2(const uint8 * src_ptr,ptrdiff_t,uint8 * dst_ptr,int dst_width)40 static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
41                                    ptrdiff_t /* src_stride */,
42                                    uint8* dst_ptr, int dst_width) {
43   __asm {
44     mov        eax, [esp + 4]        // src_ptr
45                                      // src_stride ignored
46     mov        edx, [esp + 12]       // dst_ptr
47     mov        ecx, [esp + 16]       // dst_width
48 
49     align      16
50   wloop:
51     movdqa     xmm0, [eax]
52     movdqa     xmm1, [eax + 16]
53     lea        eax,  [eax + 32]
54     shufps     xmm0, xmm1, 0x88
55     sub        ecx, 4
56     movdqa     [edx], xmm0
57     lea        edx, [edx + 16]
58     jg         wloop
59 
60     ret
61   }
62 }
63 
64 // Blends 8x2 rectangle to 4x1.
65 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
66 __declspec(naked) __declspec(align(16))
ScaleARGBRowDown2Int_SSE2(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)67 static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
68                                       ptrdiff_t src_stride,
69                                       uint8* dst_ptr, int dst_width) {
70   __asm {
71     push       esi
72     mov        eax, [esp + 4 + 4]    // src_ptr
73     mov        esi, [esp + 4 + 8]    // src_stride
74     mov        edx, [esp + 4 + 12]   // dst_ptr
75     mov        ecx, [esp + 4 + 16]   // dst_width
76 
77     align      16
78   wloop:
79     movdqa     xmm0, [eax]
80     movdqa     xmm1, [eax + 16]
81     movdqa     xmm2, [eax + esi]
82     movdqa     xmm3, [eax + esi + 16]
83     lea        eax,  [eax + 32]
84     pavgb      xmm0, xmm2            // average rows
85     pavgb      xmm1, xmm3
86     movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
87     shufps     xmm0, xmm1, 0x88      // even pixels
88     shufps     xmm2, xmm1, 0xdd      // odd pixels
89     pavgb      xmm0, xmm2
90     sub        ecx, 4
91     movdqa     [edx], xmm0
92     lea        edx, [edx + 16]
93     jg         wloop
94 
95     pop        esi
96     ret
97   }
98 }
99 
100 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
101 // Reads 4 pixels at a time.
102 // Alignment requirement: dst_ptr 16 byte aligned.
103 __declspec(naked) __declspec(align(16))
ScaleARGBRowDownEven_SSE2(const uint8 * src_ptr,ptrdiff_t src_stride,int src_stepx,uint8 * dst_ptr,int dst_width)104 void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
105                                int src_stepx,
106                                uint8* dst_ptr, int dst_width) {
107   __asm {
108     push       ebx
109     push       edi
110     mov        eax, [esp + 8 + 4]    // src_ptr
111                                      // src_stride ignored
112     mov        ebx, [esp + 8 + 12]   // src_stepx
113     mov        edx, [esp + 8 + 16]   // dst_ptr
114     mov        ecx, [esp + 8 + 20]   // dst_width
115     lea        ebx, [ebx * 4]
116     lea        edi, [ebx + ebx * 2]
117 
118     align      16
119   wloop:
120     movd       xmm0, [eax]
121     movd       xmm1, [eax + ebx]
122     punpckldq  xmm0, xmm1
123     movd       xmm2, [eax + ebx * 2]
124     movd       xmm3, [eax + edi]
125     lea        eax,  [eax + ebx * 4]
126     punpckldq  xmm2, xmm3
127     punpcklqdq xmm0, xmm2
128     sub        ecx, 4
129     movdqa     [edx], xmm0
130     lea        edx, [edx + 16]
131     jg         wloop
132 
133     pop        edi
134     pop        ebx
135     ret
136   }
137 }
138 
139 // Blends four 2x2 to 4x1.
140 // Alignment requirement: dst_ptr 16 byte aligned.
141 __declspec(naked) __declspec(align(16))
ScaleARGBRowDownEvenInt_SSE2(const uint8 * src_ptr,ptrdiff_t src_stride,int src_stepx,uint8 * dst_ptr,int dst_width)142 static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
143                                          ptrdiff_t src_stride,
144                                          int src_stepx,
145                                          uint8* dst_ptr, int dst_width) {
146   __asm {
147     push       ebx
148     push       esi
149     push       edi
150     mov        eax, [esp + 12 + 4]    // src_ptr
151     mov        esi, [esp + 12 + 8]    // src_stride
152     mov        ebx, [esp + 12 + 12]   // src_stepx
153     mov        edx, [esp + 12 + 16]   // dst_ptr
154     mov        ecx, [esp + 12 + 20]   // dst_width
155     lea        esi, [eax + esi]      // row1 pointer
156     lea        ebx, [ebx * 4]
157     lea        edi, [ebx + ebx * 2]
158 
159     align      16
160   wloop:
161     movq       xmm0, qword ptr [eax] // row0 4 pairs
162     movhps     xmm0, qword ptr [eax + ebx]
163     movq       xmm1, qword ptr [eax + ebx * 2]
164     movhps     xmm1, qword ptr [eax + edi]
165     lea        eax,  [eax + ebx * 4]
166     movq       xmm2, qword ptr [esi] // row1 4 pairs
167     movhps     xmm2, qword ptr [esi + ebx]
168     movq       xmm3, qword ptr [esi + ebx * 2]
169     movhps     xmm3, qword ptr [esi + edi]
170     lea        esi,  [esi + ebx * 4]
171     pavgb      xmm0, xmm2            // average rows
172     pavgb      xmm1, xmm3
173     movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
174     shufps     xmm0, xmm1, 0x88      // even pixels
175     shufps     xmm2, xmm1, 0xdd      // odd pixels
176     pavgb      xmm0, xmm2
177     sub        ecx, 4
178     movdqa     [edx], xmm0
179     lea        edx, [edx + 16]
180     jg         wloop
181 
182     pop        edi
183     pop        esi
184     pop        ebx
185     ret
186   }
187 }
188 
189 // Bilinear row filtering combines 4x2 -> 4x1. SSE2 version.
190 #ifndef SSE2_DISABLED
191 #define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
192 __declspec(naked) __declspec(align(16))
ScaleARGBFilterRows_SSE2(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)193 void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
194                               ptrdiff_t src_stride, int dst_width,
195                               int source_y_fraction) {
196   __asm {
197     push       esi
198     push       edi
199     mov        edi, [esp + 8 + 4]   // dst_ptr
200     mov        esi, [esp + 8 + 8]   // src_ptr
201     mov        edx, [esp + 8 + 12]  // src_stride
202     mov        ecx, [esp + 8 + 16]  // dst_width
203     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
204     sub        edi, esi
205     cmp        eax, 0
206     je         xloop1
207     cmp        eax, 128
208     je         xloop2
209 
210     movd       xmm5, eax            // xmm5 = y fraction
211     punpcklbw  xmm5, xmm5
212     punpcklwd  xmm5, xmm5
213     pshufd     xmm5, xmm5, 0
214     pxor       xmm4, xmm4
215 
216     // f * row1 + (1 - frac) row0
217     // frac * (row1 - row0) + row0
218     align      16
219   xloop:
220     movdqa     xmm0, [esi]  // row0
221     movdqa     xmm2, [esi + edx]  // row1
222     movdqa     xmm1, xmm0
223     movdqa     xmm3, xmm2
224     punpcklbw  xmm2, xmm4
225     punpckhbw  xmm3, xmm4
226     punpcklbw  xmm0, xmm4
227     punpckhbw  xmm1, xmm4
228     psubw      xmm2, xmm0  // row1 - row0
229     psubw      xmm3, xmm1
230     pmulhw     xmm2, xmm5  // scale diff
231     pmulhw     xmm3, xmm5
232     paddw      xmm0, xmm2  // sum rows
233     paddw      xmm1, xmm3
234     packuswb   xmm0, xmm1
235     sub        ecx, 4
236     movdqa     [esi + edi], xmm0
237     lea        esi, [esi + 16]
238     jg         xloop
239 
240     shufps     xmm0, xmm0, 0xff
241     movdqa     [esi + edi], xmm0    // duplicate last pixel for filtering
242     pop        edi
243     pop        esi
244     ret
245 
246     align      16
247   xloop1:
248     movdqa     xmm0, [esi]
249     sub        ecx, 4
250     movdqa     [esi + edi], xmm0
251     lea        esi, [esi + 16]
252     jg         xloop1
253 
254     shufps     xmm0, xmm0, 0xff
255     movdqa     [esi + edi], xmm0
256     pop        edi
257     pop        esi
258     ret
259 
260     align      16
261   xloop2:
262     movdqa     xmm0, [esi]
263     pavgb      xmm0, [esi + edx]
264     sub        ecx, 4
265     movdqa     [esi + edi], xmm0
266     lea        esi, [esi + 16]
267     jg         xloop2
268 
269     shufps     xmm0, xmm0, 0xff
270     movdqa     [esi + edi], xmm0
271     pop        edi
272     pop        esi
273     ret
274   }
275 }
276 #endif  // SSE2_DISABLED
277 
278 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
279 #define HAS_SCALEARGBFILTERROWS_SSSE3
280 __declspec(naked) __declspec(align(16))
ScaleARGBFilterRows_SSSE3(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)281 void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
282                                ptrdiff_t src_stride, int dst_width,
283                                int source_y_fraction) {
284   __asm {
285     push       esi
286     push       edi
287     mov        edi, [esp + 8 + 4]   // dst_ptr
288     mov        esi, [esp + 8 + 8]   // src_ptr
289     mov        edx, [esp + 8 + 12]  // src_stride
290     mov        ecx, [esp + 8 + 16]  // dst_width
291     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
292     sub        edi, esi
293     shr        eax, 1
294     cmp        eax, 0
295     je         xloop1
296     cmp        eax, 64
297     je         xloop2
298     movd       xmm0, eax  // high fraction 0..127
299     neg        eax
300     add        eax, 128
301     movd       xmm5, eax  // low fraction 128..1
302     punpcklbw  xmm5, xmm0
303     punpcklwd  xmm5, xmm5
304     pshufd     xmm5, xmm5, 0
305 
306     align      16
307   xloop:
308     movdqa     xmm0, [esi]
309     movdqa     xmm2, [esi + edx]
310     movdqa     xmm1, xmm0
311     punpcklbw  xmm0, xmm2
312     punpckhbw  xmm1, xmm2
313     pmaddubsw  xmm0, xmm5
314     pmaddubsw  xmm1, xmm5
315     psrlw      xmm0, 7
316     psrlw      xmm1, 7
317     packuswb   xmm0, xmm1
318     sub        ecx, 4
319     movdqa     [esi + edi], xmm0
320     lea        esi, [esi + 16]
321     jg         xloop
322 
323     shufps     xmm0, xmm0, 0xff
324     movdqa     [esi + edi], xmm0    // duplicate last pixel for filtering
325     pop        edi
326     pop        esi
327     ret
328 
329     align      16
330   xloop1:
331     movdqa     xmm0, [esi]
332     sub        ecx, 4
333     movdqa     [esi + edi], xmm0
334     lea        esi, [esi + 16]
335     jg         xloop1
336 
337     shufps     xmm0, xmm0, 0xff
338     movdqa     [esi + edi], xmm0
339     pop        edi
340     pop        esi
341     ret
342 
343     align      16
344   xloop2:
345     movdqa     xmm0, [esi]
346     pavgb      xmm0, [esi + edx]
347     sub        ecx, 4
348     movdqa     [esi + edi], xmm0
349     lea        esi, [esi + 16]
350     jg         xloop2
351 
352     shufps     xmm0, xmm0, 0xff
353     movdqa     [esi + edi], xmm0
354     pop        edi
355     pop        esi
356     ret
357   }
358 }
359 
360 #elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
361 
362 // GCC versions of row functions are verbatim conversions from Visual C.
363 // Generated using gcc disassembly on Visual C object file:
364 // objdump -D yuvscaler.obj >yuvscaler.txt
365 #define HAS_SCALEARGBROWDOWN2_SSE2
366 static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
367                                    ptrdiff_t /* src_stride */,
368                                    uint8* dst_ptr, int dst_width) {
369   asm volatile (
370     ".p2align  4                               \n"
371   "1:                                          \n"
372     "movdqa    (%0),%%xmm0                     \n"
373     "movdqa    0x10(%0),%%xmm1                 \n"
374     "lea       0x20(%0),%0                     \n"
375     "shufps    $0x88,%%xmm1,%%xmm0             \n"
376     "sub       $0x4,%2                         \n"
377     "movdqa    %%xmm0,(%1)                     \n"
378     "lea       0x10(%1),%1                     \n"
379     "jg        1b                              \n"
380   : "+r"(src_ptr),   // %0
381     "+r"(dst_ptr),   // %1
382     "+r"(dst_width)  // %2
383   :
384   : "memory", "cc"
385 #if defined(__SSE2__)
386     , "xmm0", "xmm1"
387 #endif
388   );
389 }
390 
391 static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
392                                       ptrdiff_t src_stride,
393                                       uint8* dst_ptr, int dst_width) {
394   asm volatile (
395     ".p2align  4                               \n"
396   "1:                                          \n"
397     "movdqa    (%0),%%xmm0                     \n"
398     "movdqa    0x10(%0),%%xmm1                 \n"
399     "movdqa    (%0,%3,1),%%xmm2                \n"
400     "movdqa    0x10(%0,%3,1),%%xmm3            \n"
401     "lea       0x20(%0),%0                     \n"
402     "pavgb     %%xmm2,%%xmm0                   \n"
403     "pavgb     %%xmm3,%%xmm1                   \n"
404     "movdqa    %%xmm0,%%xmm2                   \n"
405     "shufps    $0x88,%%xmm1,%%xmm0             \n"
406     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
407     "pavgb     %%xmm2,%%xmm0                   \n"
408     "sub       $0x4,%2                         \n"
409     "movdqa    %%xmm0,(%1)                     \n"
410     "lea       0x10(%1),%1                     \n"
411     "jg        1b                              \n"
412   : "+r"(src_ptr),    // %0
413     "+r"(dst_ptr),    // %1
414     "+r"(dst_width)   // %2
415   : "r"(static_cast<intptr_t>(src_stride))   // %3
416   : "memory", "cc"
417 #if defined(__SSE2__)
418     , "xmm0", "xmm1", "xmm2", "xmm3"
419 #endif
420   );
421 }
422 
423 #define HAS_SCALEARGBROWDOWNEVEN_SSE2
424 // Reads 4 pixels at a time.
425 // Alignment requirement: dst_ptr 16 byte aligned.
426 void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
427                                int src_stepx,
428                                uint8* dst_ptr, int dst_width) {
429   intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
430   intptr_t src_stepx_x12 = 0;
431   asm volatile (
432     "lea       0x0(,%1,4),%1                   \n"
433     "lea       (%1,%1,2),%4                    \n"
434     ".p2align  4                               \n"
435   "1:                                          \n"
436     "movd      (%0),%%xmm0                     \n"
437     "movd      (%0,%1,1),%%xmm1                \n"
438     "punpckldq %%xmm1,%%xmm0                   \n"
439     "movd      (%0,%1,2),%%xmm2                \n"
440     "movd      (%0,%4,1),%%xmm3                \n"
441     "lea       (%0,%1,4),%0                    \n"
442     "punpckldq %%xmm3,%%xmm2                   \n"
443     "punpcklqdq %%xmm2,%%xmm0                  \n"
444     "sub       $0x4,%3                         \n"
445     "movdqa    %%xmm0,(%2)                     \n"
446     "lea       0x10(%2),%2                     \n"
447     "jg        1b                              \n"
448   : "+r"(src_ptr),       // %0
449     "+r"(src_stepx_x4),  // %1
450     "+r"(dst_ptr),       // %2
451     "+r"(dst_width),     // %3
452     "+r"(src_stepx_x12)  // %4
453   :
454   : "memory", "cc"
455 #if defined(__SSE2__)
456     , "xmm0", "xmm1", "xmm2", "xmm3"
457 #endif
458   );
459 }
460 
461 // Blends four 2x2 to 4x1.
462 // Alignment requirement: dst_ptr 16 byte aligned.
463 static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
464                                          ptrdiff_t src_stride, int src_stepx,
465                                          uint8* dst_ptr, int dst_width) {
466   intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
467   intptr_t src_stepx_x12 = 0;
468   intptr_t row1 = static_cast<intptr_t>(src_stride);
469   asm volatile (
470     "lea       0x0(,%1,4),%1                   \n"
471     "lea       (%1,%1,2),%4                    \n"
472     "lea       (%0,%5,1),%5                    \n"
473     ".p2align  4                               \n"
474   "1:                                          \n"
475     "movq      (%0),%%xmm0                     \n"
476     "movhps    (%0,%1,1),%%xmm0                \n"
477     "movq      (%0,%1,2),%%xmm1                \n"
478     "movhps    (%0,%4,1),%%xmm1                \n"
479     "lea       (%0,%1,4),%0                    \n"
480     "movq      (%5),%%xmm2                     \n"
481     "movhps    (%5,%1,1),%%xmm2                \n"
482     "movq      (%5,%1,2),%%xmm3                \n"
483     "movhps    (%5,%4,1),%%xmm3                \n"
484     "lea       (%5,%1,4),%5                    \n"
485     "pavgb     %%xmm2,%%xmm0                   \n"
486     "pavgb     %%xmm3,%%xmm1                   \n"
487     "movdqa    %%xmm0,%%xmm2                   \n"
488     "shufps    $0x88,%%xmm1,%%xmm0             \n"
489     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
490     "pavgb     %%xmm2,%%xmm0                   \n"
491     "sub       $0x4,%3                         \n"
492     "movdqa    %%xmm0,(%2)                     \n"
493     "lea       0x10(%2),%2                     \n"
494     "jg        1b                              \n"
495   : "+r"(src_ptr),        // %0
496     "+r"(src_stepx_x4),   // %1
497     "+r"(dst_ptr),        // %2
498     "+rm"(dst_width),     // %3
499     "+r"(src_stepx_x12),  // %4
500     "+r"(row1)            // %5
501   :
502   : "memory", "cc"
503 #if defined(__SSE2__)
504     , "xmm0", "xmm1", "xmm2", "xmm3"
505 #endif
506   );
507 }
508 
509 #ifndef SSE2_DISABLED
510 // Bilinear row filtering combines 4x2 -> 4x1. SSE2 version
511 #define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
512 void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
513                               ptrdiff_t src_stride, int dst_width,
514                               int source_y_fraction) {
515   asm volatile (
516     "sub       %1,%0                           \n"
517     "cmp       $0x0,%3                         \n"
518     "je        2f                              \n"
519     "cmp       $0x80,%3                        \n"
520     "je        3f                              \n"
521     "movd      %3,%%xmm5                       \n"
522     "punpcklbw %%xmm5,%%xmm5                   \n"
523     "punpcklwd %%xmm5,%%xmm5                   \n"
524     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
525     "pxor      %%xmm4,%%xmm4                   \n"
526     ".p2align  4                               \n"
527   "1:                                          \n"
528     "movdqa    (%1),%%xmm0                     \n"
529     "movdqa    (%1,%4,1),%%xmm2                \n"
530     "movdqa    %%xmm0,%%xmm1                   \n"
531     "movdqa    %%xmm2,%%xmm3                   \n"
532     "punpcklbw %%xmm4,%%xmm2                   \n"
533     "punpckhbw %%xmm4,%%xmm3                   \n"
534     "punpcklbw %%xmm4,%%xmm0                   \n"
535     "punpckhbw %%xmm4,%%xmm1                   \n"
536     "psubw     %%xmm0,%%xmm2                   \n"
537     "psubw     %%xmm1,%%xmm3                   \n"
538     "pmulhw    %%xmm5,%%xmm2                   \n"
539     "pmulhw    %%xmm5,%%xmm3                   \n"
540     "paddw     %%xmm2,%%xmm0                   \n"
541     "paddw     %%xmm3,%%xmm1                   \n"
542     "packuswb  %%xmm1,%%xmm0                   \n"
543     "sub       $0x4,%2                         \n"
544     "movdqa    %%xmm0,(%1,%0,1)                \n"
545     "lea       0x10(%1),%1                     \n"
546     "jg        1b                              \n"
547     "jmp       4f                              \n"
548     ".p2align  4                               \n"
549   "2:                                          \n"
550     "movdqa    (%1),%%xmm0                     \n"
551     "sub       $0x4,%2                         \n"
552     "movdqa    %%xmm0,(%1,%0,1)                \n"
553     "lea       0x10(%1),%1                     \n"
554     "jg        2b                              \n"
555     "jmp       4f                              \n"
556     ".p2align  4                               \n"
557   "3:                                          \n"
558     "movdqa    (%1),%%xmm0                     \n"
559     "pavgb     (%1,%4,1),%%xmm0                \n"
560     "sub       $0x4,%2                         \n"
561     "movdqa    %%xmm0,(%1,%0,1)                \n"
562     "lea       0x10(%1),%1                     \n"
563     "lea       0x10(%1),%1                     \n"
564     "jg        3b                              \n"
565     ".p2align  4                               \n"
566   "4:                                          \n"
567     "shufps    $0xff,%%xmm0,%%xmm0             \n"
568     "movdqa    %%xmm0,(%1,%0,1)                \n"
569   : "+r"(dst_ptr),     // %0
570     "+r"(src_ptr),     // %1
571     "+r"(dst_width),   // %2
572     "+r"(source_y_fraction)  // %3
573   : "r"(static_cast<intptr_t>(src_stride))  // %4
574   : "memory", "cc"
575 #if defined(__SSE2__)
576     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
577 #endif
578   );
579 }
580 #endif  // SSE2_DISABLED
581 
582 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
583 #define HAS_SCALEARGBFILTERROWS_SSSE3
584 void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
585                                ptrdiff_t src_stride, int dst_width,
586                                int source_y_fraction) {
587   asm volatile (
588     "sub       %1,%0                           \n"
589     "shr       %3                              \n"
590     "cmp       $0x0,%3                         \n"
591     "je        2f                              \n"
592     "cmp       $0x40,%3                        \n"
593     "je        3f                              \n"
594     "movd      %3,%%xmm0                       \n"
595     "neg       %3                              \n"
596     "add       $0x80,%3                        \n"
597     "movd      %3,%%xmm5                       \n"
598     "punpcklbw %%xmm0,%%xmm5                   \n"
599     "punpcklwd %%xmm5,%%xmm5                   \n"
600     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
601     ".p2align  4                               \n"
602   "1:                                          \n"
603     "movdqa    (%1),%%xmm0                     \n"
604     "movdqa    (%1,%4,1),%%xmm2                \n"
605     "movdqa    %%xmm0,%%xmm1                   \n"
606     "punpcklbw %%xmm2,%%xmm0                   \n"
607     "punpckhbw %%xmm2,%%xmm1                   \n"
608     "pmaddubsw %%xmm5,%%xmm0                   \n"
609     "pmaddubsw %%xmm5,%%xmm1                   \n"
610     "psrlw     $0x7,%%xmm0                     \n"
611     "psrlw     $0x7,%%xmm1                     \n"
612     "packuswb  %%xmm1,%%xmm0                   \n"
613     "sub       $0x4,%2                         \n"
614     "movdqa    %%xmm0,(%1,%0,1)                \n"
615     "lea       0x10(%1),%1                     \n"
616     "jg        1b                              \n"
617     "jmp       4f                              \n"
618     ".p2align  4                               \n"
619   "2:                                          \n"
620     "movdqa    (%1),%%xmm0                     \n"
621     "sub       $0x4,%2                         \n"
622     "movdqa    %%xmm0,(%1,%0,1)                \n"
623     "lea       0x10(%1),%1                     \n"
624     "jg        2b                              \n"
625     "jmp       4f                              \n"
626     ".p2align  4                               \n"
627   "3:                                          \n"
628     "movdqa    (%1),%%xmm0                     \n"
629     "pavgb     (%1,%4,1),%%xmm0                \n"
630     "sub       $0x4,%2                         \n"
631     "movdqa    %%xmm0,(%1,%0,1)                \n"
632     "lea       0x10(%1),%1                     \n"
633     "jg        3b                              \n"
634   "4:                                          \n"
635     ".p2align  4                               \n"
636     "shufps    $0xff,%%xmm0,%%xmm0             \n"
637     "movdqa    %%xmm0,(%1,%0,1)                \n"
638   : "+r"(dst_ptr),     // %0
639     "+r"(src_ptr),     // %1
640     "+r"(dst_width),   // %2
641     "+r"(source_y_fraction)  // %3
642   : "r"(static_cast<intptr_t>(src_stride))  // %4
643   : "memory", "cc"
644 #if defined(__SSE2__)
645     , "xmm0", "xmm1", "xmm2", "xmm5"
646 #endif
647   );
648 }
649 #endif  // defined(__x86_64__) || defined(__i386__)
650 
ScaleARGBRowDown2_C(const uint8 * src_ptr,ptrdiff_t,uint8 * dst_ptr,int dst_width)651 static void ScaleARGBRowDown2_C(const uint8* src_ptr,
652                                 ptrdiff_t /* src_stride */,
653                                 uint8* dst_ptr, int dst_width) {
654   const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
655   uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
656 
657   for (int x = 0; x < dst_width - 1; x += 2) {
658     dst[0] = src[0];
659     dst[1] = src[2];
660     src += 4;
661     dst += 2;
662   }
663   if (dst_width & 1) {
664     dst[0] = src[0];
665   }
666 }
667 
ScaleARGBRowDown2Int_C(const uint8 * src_ptr,ptrdiff_t src_stride,uint8 * dst_ptr,int dst_width)668 static void ScaleARGBRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
669                                    uint8* dst_ptr, int dst_width) {
670   for (int x = 0; x < dst_width; ++x) {
671     dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
672                   src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
673     dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
674                   src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
675     dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
676                   src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
677     dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
678                   src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
679     src_ptr += 8;
680     dst_ptr += 4;
681   }
682 }
683 
ScaleARGBRowDownEven_C(const uint8 * src_ptr,ptrdiff_t,int src_stepx,uint8 * dst_ptr,int dst_width)684 void ScaleARGBRowDownEven_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
685                             int src_stepx,
686                             uint8* dst_ptr, int dst_width) {
687   const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
688   uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
689 
690   for (int x = 0; x < dst_width - 1; x += 2) {
691     dst[0] = src[0];
692     dst[1] = src[src_stepx];
693     src += src_stepx * 2;
694     dst += 2;
695   }
696   if (dst_width & 1) {
697     dst[0] = src[0];
698   }
699 }
700 
ScaleARGBRowDownEvenInt_C(const uint8 * src_ptr,ptrdiff_t src_stride,int src_stepx,uint8 * dst_ptr,int dst_width)701 static void ScaleARGBRowDownEvenInt_C(const uint8* src_ptr,
702                                       ptrdiff_t src_stride,
703                                       int src_stepx,
704                                       uint8* dst_ptr, int dst_width) {
705   for (int x = 0; x < dst_width; ++x) {
706     dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
707                   src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
708     dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
709                   src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
710     dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
711                   src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
712     dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
713                   src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
714     src_ptr += src_stepx * 4;
715     dst_ptr += 4;
716   }
717 }
718 
719 // (1-f)a + fb can be replaced with a + f(b-a)
720 
721 #define BLENDER1(a, b, f) (static_cast<int>(a) + \
722     ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
723 
724 #define BLENDERC(a, b, f, s) static_cast<uint32>( \
725     BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
726 
727 #define BLENDER(a, b, f) \
728     BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
729     BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
730 
ScaleARGBFilterCols_C(uint8 * dst_ptr,const uint8 * src_ptr,int dst_width,int x,int dx)731 static void ScaleARGBFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
732                                   int dst_width, int x, int dx) {
733   const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
734   uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
735   for (int j = 0; j < dst_width - 1; j += 2) {
736     int xi = x >> 16;
737     uint32 a = src[xi];
738     uint32 b = src[xi + 1];
739     dst[0] = BLENDER(a, b, x & 0xffff);
740     x += dx;
741     xi = x >> 16;
742     a = src[xi];
743     b = src[xi + 1];
744     dst[1] = BLENDER(a, b, x & 0xffff);
745     x += dx;
746     dst += 2;
747   }
748   if (dst_width & 1) {
749     int xi = x >> 16;
750     uint32 a = src[xi];
751     uint32 b = src[xi + 1];
752     dst[0] = BLENDER(a, b, x & 0xffff);
753   }
754 }
755 
756 static const int kMaxInputWidth = 2560;
757 
758 // C version 2x2 -> 2x1
ScaleARGBFilterRows_C(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)759 void ScaleARGBFilterRows_C(uint8* dst_ptr, const uint8* src_ptr,
760                            ptrdiff_t src_stride,
761                            int dst_width, int source_y_fraction) {
762   assert(dst_width > 0);
763   int y1_fraction = source_y_fraction;
764   int y0_fraction = 256 - y1_fraction;
765   const uint8* src_ptr1 = src_ptr + src_stride;
766   uint8* end = dst_ptr + (dst_width << 2);
767   do {
768     dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
769     dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
770     dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
771     dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
772     dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
773     dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
774     dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
775     dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
776     src_ptr += 8;
777     src_ptr1 += 8;
778     dst_ptr += 8;
779   } while (dst_ptr < end);
780   // Duplicate the last pixel (4 bytes) for filtering.
781   dst_ptr[0] = dst_ptr[-4];
782   dst_ptr[1] = dst_ptr[-3];
783   dst_ptr[2] = dst_ptr[-2];
784   dst_ptr[3] = dst_ptr[-1];
785 }
786 
787 /**
788  * ScaleARGB ARGB, 1/2
789  *
790  * This is an optimized version for scaling down a ARGB to 1/2 of
791  * its original size.
792  *
793  */
ScaleARGBDown2(int,int,int dst_width,int dst_height,int src_stride,int dst_stride,const uint8 * src_ptr,uint8 * dst_ptr,FilterMode filtering)794 static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
795                            int dst_width, int dst_height,
796                            int src_stride, int dst_stride,
797                            const uint8* src_ptr, uint8* dst_ptr,
798                            FilterMode filtering) {
799   void (*ScaleARGBRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
800                             uint8* dst_ptr, int dst_width) =
801       filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
802 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
803   if (TestCpuFlag(kCpuHasSSE2) &&
804       IS_ALIGNED(dst_width, 4) &&
805       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
806       IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
807     ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
808         ScaleARGBRowDown2_SSE2;
809   }
810 #endif
811 
812   // TODO(fbarchard): Loop through source height to allow odd height.
813   for (int y = 0; y < dst_height; ++y) {
814     ScaleARGBRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
815     src_ptr += (src_stride << 1);
816     dst_ptr += dst_stride;
817   }
818 }
819 
820 /**
821  * ScaleARGB ARGB Even
822  *
823  * This is an optimized version for scaling down a ARGB to even
824  * multiple of its original size.
825  *
826  */
ScaleARGBDownEven(int src_width,int src_height,int dst_width,int dst_height,int src_stride,int dst_stride,const uint8 * src_ptr,uint8 * dst_ptr,FilterMode filtering)827 static void ScaleARGBDownEven(int src_width, int src_height,
828                               int dst_width, int dst_height,
829                               int src_stride, int dst_stride,
830                               const uint8* src_ptr, uint8* dst_ptr,
831                               FilterMode filtering) {
832   assert(IS_ALIGNED(src_width, 2));
833   assert(IS_ALIGNED(src_height, 2));
834   void (*ScaleARGBRowDownEven)(const uint8* src_ptr, ptrdiff_t src_stride,
835                                int src_step, uint8* dst_ptr, int dst_width) =
836       filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
837 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
838   if (TestCpuFlag(kCpuHasSSE2) &&
839       IS_ALIGNED(dst_width, 4) &&
840       IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
841     ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_SSE2 :
842         ScaleARGBRowDownEven_SSE2;
843   }
844 #endif
845   int src_step = src_width / dst_width;
846   // Adjust to point to center of box.
847   int row_step = src_height / dst_height;
848   int row_stride = row_step * src_stride;
849   src_ptr += ((row_step >> 1) - 1) * src_stride + ((src_step >> 1) - 1) * 4;
850   for (int y = 0; y < dst_height; ++y) {
851     ScaleARGBRowDownEven(src_ptr, src_stride, src_step, dst_ptr, dst_width);
852     src_ptr += row_stride;
853     dst_ptr += dst_stride;
854   }
855 }
856 /**
857  * ScaleARGB ARGB to/from any dimensions, with bilinear
858  * interpolation.
859  */
860 
ScaleARGBBilinear(int src_width,int src_height,int dst_width,int dst_height,int src_stride,int dst_stride,const uint8 * src_ptr,uint8 * dst_ptr)861 static void ScaleARGBBilinear(int src_width, int src_height,
862                               int dst_width, int dst_height,
863                               int src_stride, int dst_stride,
864                               const uint8* src_ptr, uint8* dst_ptr) {
865   assert(dst_width > 0);
866   assert(dst_height > 0);
867   assert(src_width <= kMaxInputWidth);
868   SIMD_ALIGNED(uint8 row[kMaxInputWidth * 4 + 16]);
869   void (*ScaleARGBFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
870                               ptrdiff_t src_stride,
871                               int dst_width, int source_y_fraction) =
872       ScaleARGBFilterRows_C;
873 #if defined(HAS_SCALEARGBFILTERROWS_SSE2)
874   if (TestCpuFlag(kCpuHasSSE2) &&
875       IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
876     ScaleARGBFilterRows = ScaleARGBFilterRows_SSE2;
877   }
878 #endif
879 #if defined(HAS_SCALEARGBFILTERROWS_SSSE3)
880   if (TestCpuFlag(kCpuHasSSSE3) &&
881       IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
882     ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
883   }
884 #endif
885   int dx = (src_width << 16) / dst_width;
886   int dy = (src_height << 16) / dst_height;
887   int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
888   int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
889   int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
890   for (int j = 0; j < dst_height; ++j) {
891     int yi = y >> 16;
892     int yf = (y >> 8) & 255;
893     const uint8* src = src_ptr + yi * src_stride;
894     ScaleARGBFilterRows(row, src, src_stride, src_width, yf);
895     ScaleARGBFilterCols_C(dst_ptr, row, dst_width, x, dx);
896     dst_ptr += dst_stride;
897     y += dy;
898     if (y > maxy) {
899       y = maxy;
900     }
901   }
902 }
903 
904 // Scales a single row of pixels using point sampling.
905 // Code is adapted from libyuv bilinear yuv scaling, but with bilinear
906 //     interpolation off, and argb pixels instead of yuv.
ScaleARGBCols(uint8 * dst_ptr,const uint8 * src_ptr,int dst_width,int x,int dx)907 static void ScaleARGBCols(uint8* dst_ptr, const uint8* src_ptr,
908                           int dst_width, int x, int dx) {
909   const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
910   uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
911   for (int j = 0; j < dst_width - 1; j += 2) {
912     dst[0] = src[x >> 16];
913     x += dx;
914     dst[1] = src[x >> 16];
915     x += dx;
916     dst += 2;
917   }
918   if (dst_width & 1) {
919     dst[0] = src[x >> 16];
920   }
921 }
922 
923 /**
924  * ScaleARGB ARGB to/from any dimensions, without interpolation.
925  * Fixed point math is used for performance: The upper 16 bits
926  * of x and dx is the integer part of the source position and
927  * the lower 16 bits are the fixed decimal part.
928  */
929 
ScaleARGBSimple(int src_width,int src_height,int dst_width,int dst_height,int src_stride,int dst_stride,const uint8 * src_ptr,uint8 * dst_ptr)930 static void ScaleARGBSimple(int src_width, int src_height,
931                             int dst_width, int dst_height,
932                             int src_stride, int dst_stride,
933                             const uint8* src_ptr, uint8* dst_ptr) {
934   int dx = (src_width << 16) / dst_width;
935   int dy = (src_height << 16) / dst_height;
936   int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
937   int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
938   for (int i = 0; i < dst_height; ++i) {
939     ScaleARGBCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
940     dst_ptr += dst_stride;
941     y += dy;
942   }
943 }
944 
945 /**
946  * ScaleARGB ARGB to/from any dimensions.
947  */
ScaleARGBAnySize(int src_width,int src_height,int dst_width,int dst_height,int src_stride,int dst_stride,const uint8 * src_ptr,uint8 * dst_ptr,FilterMode filtering)948 static void ScaleARGBAnySize(int src_width, int src_height,
949                              int dst_width, int dst_height,
950                              int src_stride, int dst_stride,
951                              const uint8* src_ptr, uint8* dst_ptr,
952                              FilterMode filtering) {
953   if (!filtering || (src_width > kMaxInputWidth)) {
954     ScaleARGBSimple(src_width, src_height, dst_width, dst_height,
955                     src_stride, dst_stride, src_ptr, dst_ptr);
956   } else {
957     ScaleARGBBilinear(src_width, src_height, dst_width, dst_height,
958                       src_stride, dst_stride, src_ptr, dst_ptr);
959   }
960 }
961 
962 // ScaleARGB a ARGB.
963 //
964 // This function in turn calls a scaling function
965 // suitable for handling the desired resolutions.
966 
ScaleARGB(const uint8 * src,int src_stride,int src_width,int src_height,uint8 * dst,int dst_stride,int dst_width,int dst_height,FilterMode filtering)967 static void ScaleARGB(const uint8* src, int src_stride,
968                       int src_width, int src_height,
969                       uint8* dst, int dst_stride,
970                       int dst_width, int dst_height,
971                       FilterMode filtering) {
972 #ifdef CPU_X86
973   // environment variable overrides for testing.
974   char *filter_override = getenv("LIBYUV_FILTER");
975   if (filter_override) {
976     filtering = (FilterMode)atoi(filter_override);  // NOLINT
977   }
978 #endif
979   if (dst_width == src_width && dst_height == src_height) {
980     // Straight copy.
981     ARGBCopy(src, src_stride, dst, dst_stride, dst_width, dst_height);
982     return;
983   }
984   if (2 * dst_width == src_width && 2 * dst_height == src_height) {
985     // Optimized 1/2.
986     ScaleARGBDown2(src_width, src_height, dst_width, dst_height,
987                    src_stride, dst_stride, src, dst, filtering);
988     return;
989   }
990   int scale_down_x = src_width / dst_width;
991   int scale_down_y = src_height / dst_height;
992   if (dst_width * scale_down_x == src_width &&
993       dst_height * scale_down_y == src_height) {
994     if (!(scale_down_x & 1) && !(scale_down_y & 1)) {
995       // Optimized even scale down. ie 4, 6, 8, 10x
996       ScaleARGBDownEven(src_width, src_height, dst_width, dst_height,
997                         src_stride, dst_stride, src, dst, filtering);
998       return;
999     }
1000     if ((scale_down_x & 1) && (scale_down_y & 1)) {
1001       filtering = kFilterNone;
1002     }
1003   }
1004   // Arbitrary scale up and/or down.
1005   ScaleARGBAnySize(src_width, src_height, dst_width, dst_height,
1006                    src_stride, dst_stride, src, dst, filtering);
1007 }
1008 
1009 // ScaleARGB an ARGB image.
1010 LIBYUV_API
ARGBScale(const uint8 * src_argb,int src_stride_argb,int src_width,int src_height,uint8 * dst_argb,int dst_stride_argb,int dst_width,int dst_height,FilterMode filtering)1011 int ARGBScale(const uint8* src_argb, int src_stride_argb,
1012              int src_width, int src_height,
1013              uint8* dst_argb, int dst_stride_argb,
1014              int dst_width, int dst_height,
1015              FilterMode filtering) {
1016   if (!src_argb || src_width <= 0 || src_height == 0 ||
1017       !dst_argb || dst_width <= 0 || dst_height <= 0) {
1018     return -1;
1019   }
1020   // Negative height means invert the image.
1021   if (src_height < 0) {
1022     src_height = -src_height;
1023     src_argb = src_argb + (src_height - 1) * src_stride_argb;
1024     src_stride_argb = -src_stride_argb;
1025   }
1026   ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
1027             dst_argb, dst_stride_argb, dst_width, dst_height,
1028             filtering);
1029   return 0;
1030 }
1031 
1032 #ifdef __cplusplus
1033 }  // extern "C"
1034 }  // namespace libyuv
1035 #endif
1036