/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include "libyuv/rotate.h"
12 
13 #include "libyuv/cpu_id.h"
14 #include "libyuv/convert.h"
15 #include "libyuv/planar_functions.h"
16 #include "libyuv/row.h"
17 
18 #ifdef __cplusplus
19 namespace libyuv {
20 extern "C" {
21 #endif
22 
// DECLARE_FUNCTION(name) expands to the assembler text that opens a function
// written as file-scope asm: it switches to .text, aligns the entry point
// (padding with 0x90) and emits the label for |name|.
//  - Apple i386: label is "_name" and is marked .private_extern.
//  - 32-bit MinGW / Cygwin: label is "_name".
//  - everything else: label is "name", without a leading underscore.
// The MinGW/Cygwin test is parenthesised so the underscore-prefixed label is
// selected only together with __i386__; without the parentheses && binds
// tighter than || and any __MINGW32__ build would take that branch.
#if !defined(YUV_DISABLE_ASM) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name)                                                 \
    ".text                                     \n"                             \
    ".private_extern _" #name "                \n"                             \
    ".align 4,0x90                             \n"                             \
"_" #name ":                                   \n"
#elif (defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
#define DECLARE_FUNCTION(name)                                                 \
    ".text                                     \n"                             \
    ".align 4,0x90                             \n"                             \
"_" #name ":                                   \n"
#else
#define DECLARE_FUNCTION(name)                                                 \
    ".text                                     \n"                             \
    ".align 4,0x90                             \n"                             \
#name ":                                       \n"
#endif
#endif
43 
// Prototypes for the ARM NEON row-mirror and transpose routines. Only the
// declarations are present here; each HAS_* macro advertises that the routine
// declared right after it is available when the build targets NEON and
// assembly has not been disabled.
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
#define HAS_MIRRORROW_UV_NEON
void MirrorRowUV_NEON(const uint8* src,
                      uint8* dst_a, uint8* dst_b,
                      int width);
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWX8_NEON
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width);
#endif  // !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
60 
61 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
62 #define HAS_TRANSPOSE_WX8_SSSE3
63 __declspec(naked) __declspec(align(16))
TransposeWx8_SSSE3(const uint8 * src,int src_stride,uint8 * dst,int dst_stride,int width)64 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
65                                uint8* dst, int dst_stride, int width) {
66   __asm {
67     push      edi
68     push      esi
69     push      ebp
70     mov       eax, [esp + 12 + 4]   // src
71     mov       edi, [esp + 12 + 8]   // src_stride
72     mov       edx, [esp + 12 + 12]  // dst
73     mov       esi, [esp + 12 + 16]  // dst_stride
74     mov       ecx, [esp + 12 + 20]  // width
75 
76     // Read in the data from the source pointer.
77     // First round of bit swap.
78     align      16
79  convertloop:
80     movq      xmm0, qword ptr [eax]
81     lea       ebp, [eax + 8]
82     movq      xmm1, qword ptr [eax + edi]
83     lea       eax, [eax + 2 * edi]
84     punpcklbw xmm0, xmm1
85     movq      xmm2, qword ptr [eax]
86     movdqa    xmm1, xmm0
87     palignr   xmm1, xmm1, 8
88     movq      xmm3, qword ptr [eax + edi]
89     lea       eax, [eax + 2 * edi]
90     punpcklbw xmm2, xmm3
91     movdqa    xmm3, xmm2
92     movq      xmm4, qword ptr [eax]
93     palignr   xmm3, xmm3, 8
94     movq      xmm5, qword ptr [eax + edi]
95     punpcklbw xmm4, xmm5
96     lea       eax, [eax + 2 * edi]
97     movdqa    xmm5, xmm4
98     movq      xmm6, qword ptr [eax]
99     palignr   xmm5, xmm5, 8
100     movq      xmm7, qword ptr [eax + edi]
101     punpcklbw xmm6, xmm7
102     mov       eax, ebp
103     movdqa    xmm7, xmm6
104     palignr   xmm7, xmm7, 8
105     // Second round of bit swap.
106     punpcklwd xmm0, xmm2
107     punpcklwd xmm1, xmm3
108     movdqa    xmm2, xmm0
109     movdqa    xmm3, xmm1
110     palignr   xmm2, xmm2, 8
111     palignr   xmm3, xmm3, 8
112     punpcklwd xmm4, xmm6
113     punpcklwd xmm5, xmm7
114     movdqa    xmm6, xmm4
115     movdqa    xmm7, xmm5
116     palignr   xmm6, xmm6, 8
117     palignr   xmm7, xmm7, 8
118     // Third round of bit swap.
119     // Write to the destination pointer.
120     punpckldq xmm0, xmm4
121     movq      qword ptr [edx], xmm0
122     movdqa    xmm4, xmm0
123     palignr   xmm4, xmm4, 8
124     movq      qword ptr [edx + esi], xmm4
125     lea       edx, [edx + 2 * esi]
126     punpckldq xmm2, xmm6
127     movdqa    xmm6, xmm2
128     palignr   xmm6, xmm6, 8
129     movq      qword ptr [edx], xmm2
130     punpckldq xmm1, xmm5
131     movq      qword ptr [edx + esi], xmm6
132     lea       edx, [edx + 2 * esi]
133     movdqa    xmm5, xmm1
134     movq      qword ptr [edx], xmm1
135     palignr   xmm5, xmm5, 8
136     punpckldq xmm3, xmm7
137     movq      qword ptr [edx + esi], xmm5
138     lea       edx, [edx + 2 * esi]
139     movq      qword ptr [edx], xmm3
140     movdqa    xmm7, xmm3
141     palignr   xmm7, xmm7, 8
142     sub       ecx, 8
143     movq      qword ptr [edx + esi], xmm7
144     lea       edx, [edx + 2 * esi]
145     jg        convertloop
146 
147     pop       ebp
148     pop       esi
149     pop       edi
150     ret
151   }
152 }
153 
154 #define HAS_TRANSPOSE_UVWX8_SSE2
155 __declspec(naked) __declspec(align(16))
TransposeUVWx8_SSE2(const uint8 * src,int src_stride,uint8 * dst_a,int dst_stride_a,uint8 * dst_b,int dst_stride_b,int w)156 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
157                                 uint8* dst_a, int dst_stride_a,
158                                 uint8* dst_b, int dst_stride_b,
159                                 int w) {
160   __asm {
161     push      ebx
162     push      esi
163     push      edi
164     push      ebp
165     mov       eax, [esp + 16 + 4]   // src
166     mov       edi, [esp + 16 + 8]   // src_stride
167     mov       edx, [esp + 16 + 12]  // dst_a
168     mov       esi, [esp + 16 + 16]  // dst_stride_a
169     mov       ebx, [esp + 16 + 20]  // dst_b
170     mov       ebp, [esp + 16 + 24]  // dst_stride_b
171     mov       ecx, esp
172     sub       esp, 4 + 16
173     and       esp, ~15
174     mov       [esp + 16], ecx
175     mov       ecx, [ecx + 16 + 28]  // w
176 
177     align      16
178  convertloop:
179     // Read in the data from the source pointer.
180     // First round of bit swap.
181     movdqa    xmm0, [eax]
182     movdqa    xmm1, [eax + edi]
183     lea       eax, [eax + 2 * edi]
184     movdqa    xmm7, xmm0  // use xmm7 as temp register.
185     punpcklbw xmm0, xmm1
186     punpckhbw xmm7, xmm1
187     movdqa    xmm1, xmm7
188     movdqa    xmm2, [eax]
189     movdqa    xmm3, [eax + edi]
190     lea       eax, [eax + 2 * edi]
191     movdqa    xmm7, xmm2
192     punpcklbw xmm2, xmm3
193     punpckhbw xmm7, xmm3
194     movdqa    xmm3, xmm7
195     movdqa    xmm4, [eax]
196     movdqa    xmm5, [eax + edi]
197     lea       eax, [eax + 2 * edi]
198     movdqa    xmm7, xmm4
199     punpcklbw xmm4, xmm5
200     punpckhbw xmm7, xmm5
201     movdqa    xmm5, xmm7
202     movdqa    xmm6, [eax]
203     movdqa    xmm7, [eax + edi]
204     lea       eax, [eax + 2 * edi]
205     movdqa    [esp], xmm5  // backup xmm5
206     neg       edi
207     movdqa    xmm5, xmm6   // use xmm5 as temp register.
208     punpcklbw xmm6, xmm7
209     punpckhbw xmm5, xmm7
210     movdqa    xmm7, xmm5
211     lea       eax, [eax + 8 * edi + 16]
212     neg       edi
213     // Second round of bit swap.
214     movdqa    xmm5, xmm0
215     punpcklwd xmm0, xmm2
216     punpckhwd xmm5, xmm2
217     movdqa    xmm2, xmm5
218     movdqa    xmm5, xmm1
219     punpcklwd xmm1, xmm3
220     punpckhwd xmm5, xmm3
221     movdqa    xmm3, xmm5
222     movdqa    xmm5, xmm4
223     punpcklwd xmm4, xmm6
224     punpckhwd xmm5, xmm6
225     movdqa    xmm6, xmm5
226     movdqa    xmm5, [esp]  // restore xmm5
227     movdqa    [esp], xmm6  // backup xmm6
228     movdqa    xmm6, xmm5    // use xmm6 as temp register.
229     punpcklwd xmm5, xmm7
230     punpckhwd xmm6, xmm7
231     movdqa    xmm7, xmm6
232     // Third round of bit swap.
233     // Write to the destination pointer.
234     movdqa    xmm6, xmm0
235     punpckldq xmm0, xmm4
236     punpckhdq xmm6, xmm4
237     movdqa    xmm4, xmm6
238     movdqa    xmm6, [esp]  // restore xmm6
239     movlpd    qword ptr [edx], xmm0
240     movhpd    qword ptr [ebx], xmm0
241     movlpd    qword ptr [edx + esi], xmm4
242     lea       edx, [edx + 2 * esi]
243     movhpd    qword ptr [ebx + ebp], xmm4
244     lea       ebx, [ebx + 2 * ebp]
245     movdqa    xmm0, xmm2   // use xmm0 as the temp register.
246     punpckldq xmm2, xmm6
247     movlpd    qword ptr [edx], xmm2
248     movhpd    qword ptr [ebx], xmm2
249     punpckhdq xmm0, xmm6
250     movlpd    qword ptr [edx + esi], xmm0
251     lea       edx, [edx + 2 * esi]
252     movhpd    qword ptr [ebx + ebp], xmm0
253     lea       ebx, [ebx + 2 * ebp]
254     movdqa    xmm0, xmm1   // use xmm0 as the temp register.
255     punpckldq xmm1, xmm5
256     movlpd    qword ptr [edx], xmm1
257     movhpd    qword ptr [ebx], xmm1
258     punpckhdq xmm0, xmm5
259     movlpd    qword ptr [edx + esi], xmm0
260     lea       edx, [edx + 2 * esi]
261     movhpd    qword ptr [ebx + ebp], xmm0
262     lea       ebx, [ebx + 2 * ebp]
263     movdqa    xmm0, xmm3   // use xmm0 as the temp register.
264     punpckldq xmm3, xmm7
265     movlpd    qword ptr [edx], xmm3
266     movhpd    qword ptr [ebx], xmm3
267     punpckhdq xmm0, xmm7
268     sub       ecx, 8
269     movlpd    qword ptr [edx + esi], xmm0
270     lea       edx, [edx + 2 * esi]
271     movhpd    qword ptr [ebx + ebp], xmm0
272     lea       ebx, [ebx + 2 * ebp]
273     jg        convertloop
274 
275     mov       esp, [esp + 16]
276     pop       ebp
277     pop       edi
278     pop       esi
279     pop       ebx
280     ret
281   }
282 }
283 #elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
284 #define HAS_TRANSPOSE_WX8_SSSE3
TransposeWx8_SSSE3(const uint8 * src,int src_stride,uint8 * dst,int dst_stride,int width)285 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
286                                uint8* dst, int dst_stride, int width) {
287   asm volatile (
288     // Read in the data from the source pointer.
289     // First round of bit swap.
290     ".p2align  4                                 \n"
291   "1:                                            \n"
292     "movq       (%0),%%xmm0                      \n"
293     "movq       (%0,%3),%%xmm1                   \n"
294     "lea        (%0,%3,2),%0                     \n"
295     "punpcklbw  %%xmm1,%%xmm0                    \n"
296     "movq       (%0),%%xmm2                      \n"
297     "movdqa     %%xmm0,%%xmm1                    \n"
298     "palignr    $0x8,%%xmm1,%%xmm1               \n"
299     "movq       (%0,%3),%%xmm3                   \n"
300     "lea        (%0,%3,2),%0                     \n"
301     "punpcklbw  %%xmm3,%%xmm2                    \n"
302     "movdqa     %%xmm2,%%xmm3                    \n"
303     "movq       (%0),%%xmm4                      \n"
304     "palignr    $0x8,%%xmm3,%%xmm3               \n"
305     "movq       (%0,%3),%%xmm5                   \n"
306     "lea        (%0,%3,2),%0                     \n"
307     "punpcklbw  %%xmm5,%%xmm4                    \n"
308     "movdqa     %%xmm4,%%xmm5                    \n"
309     "movq       (%0),%%xmm6                      \n"
310     "palignr    $0x8,%%xmm5,%%xmm5               \n"
311     "movq       (%0,%3),%%xmm7                   \n"
312     "lea        (%0,%3,2),%0                     \n"
313     "punpcklbw  %%xmm7,%%xmm6                    \n"
314     "neg        %3                               \n"
315     "movdqa     %%xmm6,%%xmm7                    \n"
316     "lea        0x8(%0,%3,8),%0                  \n"
317     "palignr    $0x8,%%xmm7,%%xmm7               \n"
318     "neg        %3                               \n"
319      // Second round of bit swap.
320     "punpcklwd  %%xmm2,%%xmm0                    \n"
321     "punpcklwd  %%xmm3,%%xmm1                    \n"
322     "movdqa     %%xmm0,%%xmm2                    \n"
323     "movdqa     %%xmm1,%%xmm3                    \n"
324     "palignr    $0x8,%%xmm2,%%xmm2               \n"
325     "palignr    $0x8,%%xmm3,%%xmm3               \n"
326     "punpcklwd  %%xmm6,%%xmm4                    \n"
327     "punpcklwd  %%xmm7,%%xmm5                    \n"
328     "movdqa     %%xmm4,%%xmm6                    \n"
329     "movdqa     %%xmm5,%%xmm7                    \n"
330     "palignr    $0x8,%%xmm6,%%xmm6               \n"
331     "palignr    $0x8,%%xmm7,%%xmm7               \n"
332     // Third round of bit swap.
333     // Write to the destination pointer.
334     "punpckldq  %%xmm4,%%xmm0                    \n"
335     "movq       %%xmm0,(%1)                      \n"
336     "movdqa     %%xmm0,%%xmm4                    \n"
337     "palignr    $0x8,%%xmm4,%%xmm4               \n"
338     "movq       %%xmm4,(%1,%4)                   \n"
339     "lea        (%1,%4,2),%1                     \n"
340     "punpckldq  %%xmm6,%%xmm2                    \n"
341     "movdqa     %%xmm2,%%xmm6                    \n"
342     "movq       %%xmm2,(%1)                      \n"
343     "palignr    $0x8,%%xmm6,%%xmm6               \n"
344     "punpckldq  %%xmm5,%%xmm1                    \n"
345     "movq       %%xmm6,(%1,%4)                   \n"
346     "lea        (%1,%4,2),%1                     \n"
347     "movdqa     %%xmm1,%%xmm5                    \n"
348     "movq       %%xmm1,(%1)                      \n"
349     "palignr    $0x8,%%xmm5,%%xmm5               \n"
350     "movq       %%xmm5,(%1,%4)                   \n"
351     "lea        (%1,%4,2),%1                     \n"
352     "punpckldq  %%xmm7,%%xmm3                    \n"
353     "movq       %%xmm3,(%1)                      \n"
354     "movdqa     %%xmm3,%%xmm7                    \n"
355     "palignr    $0x8,%%xmm7,%%xmm7               \n"
356     "sub        $0x8,%2                          \n"
357     "movq       %%xmm7,(%1,%4)                   \n"
358     "lea        (%1,%4,2),%1                     \n"
359     "jg         1b                               \n"
360     : "+r"(src),    // %0
361       "+r"(dst),    // %1
362       "+r"(width)   // %2
363     : "r"(static_cast<intptr_t>(src_stride)),  // %3
364       "r"(static_cast<intptr_t>(dst_stride))   // %4
365     : "memory", "cc"
366   #if defined(__SSE2__)
367       , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
368   #endif
369   );
370 }
371 
372 #if !defined(YUV_DISABLE_ASM) && defined (__i386__)
373 #define HAS_TRANSPOSE_UVWX8_SSE2
374 extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
375                                     uint8* dst_a, int dst_stride_a,
376                                     uint8* dst_b, int dst_stride_b,
377                                     int w);
378   asm (
379     DECLARE_FUNCTION(TransposeUVWx8_SSE2)
380     "push   %ebx                               \n"
381     "push   %esi                               \n"
382     "push   %edi                               \n"
383     "push   %ebp                               \n"
384     "mov    0x14(%esp),%eax                    \n"
385     "mov    0x18(%esp),%edi                    \n"
386     "mov    0x1c(%esp),%edx                    \n"
387     "mov    0x20(%esp),%esi                    \n"
388     "mov    0x24(%esp),%ebx                    \n"
389     "mov    0x28(%esp),%ebp                    \n"
390     "mov    %esp,%ecx                          \n"
391     "sub    $0x14,%esp                         \n"
392     "and    $0xfffffff0,%esp                   \n"
393     "mov    %ecx,0x10(%esp)                    \n"
394     "mov    0x2c(%ecx),%ecx                    \n"
395 
396 "1:                                            \n"
397     "movdqa (%eax),%xmm0                       \n"
398     "movdqa (%eax,%edi,1),%xmm1                \n"
399     "lea    (%eax,%edi,2),%eax                 \n"
400     "movdqa %xmm0,%xmm7                        \n"
401     "punpcklbw %xmm1,%xmm0                     \n"
402     "punpckhbw %xmm1,%xmm7                     \n"
403     "movdqa %xmm7,%xmm1                        \n"
404     "movdqa (%eax),%xmm2                       \n"
405     "movdqa (%eax,%edi,1),%xmm3                \n"
406     "lea    (%eax,%edi,2),%eax                 \n"
407     "movdqa %xmm2,%xmm7                        \n"
408     "punpcklbw %xmm3,%xmm2                     \n"
409     "punpckhbw %xmm3,%xmm7                     \n"
410     "movdqa %xmm7,%xmm3                        \n"
411     "movdqa (%eax),%xmm4                       \n"
412     "movdqa (%eax,%edi,1),%xmm5                \n"
413     "lea    (%eax,%edi,2),%eax                 \n"
414     "movdqa %xmm4,%xmm7                        \n"
415     "punpcklbw %xmm5,%xmm4                     \n"
416     "punpckhbw %xmm5,%xmm7                     \n"
417     "movdqa %xmm7,%xmm5                        \n"
418     "movdqa (%eax),%xmm6                       \n"
419     "movdqa (%eax,%edi,1),%xmm7                \n"
420     "lea    (%eax,%edi,2),%eax                 \n"
421     "movdqa %xmm5,(%esp)                       \n"
422     "neg    %edi                               \n"
423     "movdqa %xmm6,%xmm5                        \n"
424     "punpcklbw %xmm7,%xmm6                     \n"
425     "punpckhbw %xmm7,%xmm5                     \n"
426     "movdqa %xmm5,%xmm7                        \n"
427     "lea    0x10(%eax,%edi,8),%eax             \n"
428     "neg    %edi                               \n"
429     "movdqa %xmm0,%xmm5                        \n"
430     "punpcklwd %xmm2,%xmm0                     \n"
431     "punpckhwd %xmm2,%xmm5                     \n"
432     "movdqa %xmm5,%xmm2                        \n"
433     "movdqa %xmm1,%xmm5                        \n"
434     "punpcklwd %xmm3,%xmm1                     \n"
435     "punpckhwd %xmm3,%xmm5                     \n"
436     "movdqa %xmm5,%xmm3                        \n"
437     "movdqa %xmm4,%xmm5                        \n"
438     "punpcklwd %xmm6,%xmm4                     \n"
439     "punpckhwd %xmm6,%xmm5                     \n"
440     "movdqa %xmm5,%xmm6                        \n"
441     "movdqa (%esp),%xmm5                       \n"
442     "movdqa %xmm6,(%esp)                       \n"
443     "movdqa %xmm5,%xmm6                        \n"
444     "punpcklwd %xmm7,%xmm5                     \n"
445     "punpckhwd %xmm7,%xmm6                     \n"
446     "movdqa %xmm6,%xmm7                        \n"
447     "movdqa %xmm0,%xmm6                        \n"
448     "punpckldq %xmm4,%xmm0                     \n"
449     "punpckhdq %xmm4,%xmm6                     \n"
450     "movdqa %xmm6,%xmm4                        \n"
451     "movdqa (%esp),%xmm6                       \n"
452     "movlpd %xmm0,(%edx)                       \n"
453     "movhpd %xmm0,(%ebx)                       \n"
454     "movlpd %xmm4,(%edx,%esi,1)                \n"
455     "lea    (%edx,%esi,2),%edx                 \n"
456     "movhpd %xmm4,(%ebx,%ebp,1)                \n"
457     "lea    (%ebx,%ebp,2),%ebx                 \n"
458     "movdqa %xmm2,%xmm0                        \n"
459     "punpckldq %xmm6,%xmm2                     \n"
460     "movlpd %xmm2,(%edx)                       \n"
461     "movhpd %xmm2,(%ebx)                       \n"
462     "punpckhdq %xmm6,%xmm0                     \n"
463     "movlpd %xmm0,(%edx,%esi,1)                \n"
464     "lea    (%edx,%esi,2),%edx                 \n"
465     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
466     "lea    (%ebx,%ebp,2),%ebx                 \n"
467     "movdqa %xmm1,%xmm0                        \n"
468     "punpckldq %xmm5,%xmm1                     \n"
469     "movlpd %xmm1,(%edx)                       \n"
470     "movhpd %xmm1,(%ebx)                       \n"
471     "punpckhdq %xmm5,%xmm0                     \n"
472     "movlpd %xmm0,(%edx,%esi,1)                \n"
473     "lea    (%edx,%esi,2),%edx                 \n"
474     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
475     "lea    (%ebx,%ebp,2),%ebx                 \n"
476     "movdqa %xmm3,%xmm0                        \n"
477     "punpckldq %xmm7,%xmm3                     \n"
478     "movlpd %xmm3,(%edx)                       \n"
479     "movhpd %xmm3,(%ebx)                       \n"
480     "punpckhdq %xmm7,%xmm0                     \n"
481     "sub    $0x8,%ecx                          \n"
482     "movlpd %xmm0,(%edx,%esi,1)                \n"
483     "lea    (%edx,%esi,2),%edx                 \n"
484     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
485     "lea    (%ebx,%ebp,2),%ebx                 \n"
486     "jg     1b                                 \n"
487     "mov    0x10(%esp),%esp                    \n"
488     "pop    %ebp                               \n"
489     "pop    %edi                               \n"
490     "pop    %esi                               \n"
491     "pop    %ebx                               \n"
492     "ret                                       \n"
493 );
494 #elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
495 // 64 bit version has enough registers to do 16x8 to 8x16 at a time.
496 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
TransposeWx8_FAST_SSSE3(const uint8 * src,int src_stride,uint8 * dst,int dst_stride,int width)497 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
498                                     uint8* dst, int dst_stride, int width) {
499   asm volatile (
500   // Read in the data from the source pointer.
501   // First round of bit swap.
502   ".p2align  4                                 \n"
503 "1:                                            \n"
504   "movdqa     (%0),%%xmm0                      \n"
505   "movdqa     (%0,%3),%%xmm1                   \n"
506   "lea        (%0,%3,2),%0                     \n"
507   "movdqa     %%xmm0,%%xmm8                    \n"
508   "punpcklbw  %%xmm1,%%xmm0                    \n"
509   "punpckhbw  %%xmm1,%%xmm8                    \n"
510   "movdqa     (%0),%%xmm2                      \n"
511   "movdqa     %%xmm0,%%xmm1                    \n"
512   "movdqa     %%xmm8,%%xmm9                    \n"
513   "palignr    $0x8,%%xmm1,%%xmm1               \n"
514   "palignr    $0x8,%%xmm9,%%xmm9               \n"
515   "movdqa     (%0,%3),%%xmm3                   \n"
516   "lea        (%0,%3,2),%0                     \n"
517   "movdqa     %%xmm2,%%xmm10                   \n"
518   "punpcklbw  %%xmm3,%%xmm2                    \n"
519   "punpckhbw  %%xmm3,%%xmm10                   \n"
520   "movdqa     %%xmm2,%%xmm3                    \n"
521   "movdqa     %%xmm10,%%xmm11                  \n"
522   "movdqa     (%0),%%xmm4                      \n"
523   "palignr    $0x8,%%xmm3,%%xmm3               \n"
524   "palignr    $0x8,%%xmm11,%%xmm11             \n"
525   "movdqa     (%0,%3),%%xmm5                   \n"
526   "lea        (%0,%3,2),%0                     \n"
527   "movdqa     %%xmm4,%%xmm12                   \n"
528   "punpcklbw  %%xmm5,%%xmm4                    \n"
529   "punpckhbw  %%xmm5,%%xmm12                   \n"
530   "movdqa     %%xmm4,%%xmm5                    \n"
531   "movdqa     %%xmm12,%%xmm13                  \n"
532   "movdqa     (%0),%%xmm6                      \n"
533   "palignr    $0x8,%%xmm5,%%xmm5               \n"
534   "palignr    $0x8,%%xmm13,%%xmm13             \n"
535   "movdqa     (%0,%3),%%xmm7                   \n"
536   "lea        (%0,%3,2),%0                     \n"
537   "movdqa     %%xmm6,%%xmm14                   \n"
538   "punpcklbw  %%xmm7,%%xmm6                    \n"
539   "punpckhbw  %%xmm7,%%xmm14                   \n"
540   "neg        %3                               \n"
541   "movdqa     %%xmm6,%%xmm7                    \n"
542   "movdqa     %%xmm14,%%xmm15                  \n"
543   "lea        0x10(%0,%3,8),%0                 \n"
544   "palignr    $0x8,%%xmm7,%%xmm7               \n"
545   "palignr    $0x8,%%xmm15,%%xmm15             \n"
546   "neg        %3                               \n"
547    // Second round of bit swap.
548   "punpcklwd  %%xmm2,%%xmm0                    \n"
549   "punpcklwd  %%xmm3,%%xmm1                    \n"
550   "movdqa     %%xmm0,%%xmm2                    \n"
551   "movdqa     %%xmm1,%%xmm3                    \n"
552   "palignr    $0x8,%%xmm2,%%xmm2               \n"
553   "palignr    $0x8,%%xmm3,%%xmm3               \n"
554   "punpcklwd  %%xmm6,%%xmm4                    \n"
555   "punpcklwd  %%xmm7,%%xmm5                    \n"
556   "movdqa     %%xmm4,%%xmm6                    \n"
557   "movdqa     %%xmm5,%%xmm7                    \n"
558   "palignr    $0x8,%%xmm6,%%xmm6               \n"
559   "palignr    $0x8,%%xmm7,%%xmm7               \n"
560   "punpcklwd  %%xmm10,%%xmm8                   \n"
561   "punpcklwd  %%xmm11,%%xmm9                   \n"
562   "movdqa     %%xmm8,%%xmm10                   \n"
563   "movdqa     %%xmm9,%%xmm11                   \n"
564   "palignr    $0x8,%%xmm10,%%xmm10             \n"
565   "palignr    $0x8,%%xmm11,%%xmm11             \n"
566   "punpcklwd  %%xmm14,%%xmm12                  \n"
567   "punpcklwd  %%xmm15,%%xmm13                  \n"
568   "movdqa     %%xmm12,%%xmm14                  \n"
569   "movdqa     %%xmm13,%%xmm15                  \n"
570   "palignr    $0x8,%%xmm14,%%xmm14             \n"
571   "palignr    $0x8,%%xmm15,%%xmm15             \n"
572   // Third round of bit swap.
573   // Write to the destination pointer.
574   "punpckldq  %%xmm4,%%xmm0                    \n"
575   "movq       %%xmm0,(%1)                      \n"
576   "movdqa     %%xmm0,%%xmm4                    \n"
577   "palignr    $0x8,%%xmm4,%%xmm4               \n"
578   "movq       %%xmm4,(%1,%4)                   \n"
579   "lea        (%1,%4,2),%1                     \n"
580   "punpckldq  %%xmm6,%%xmm2                    \n"
581   "movdqa     %%xmm2,%%xmm6                    \n"
582   "movq       %%xmm2,(%1)                      \n"
583   "palignr    $0x8,%%xmm6,%%xmm6               \n"
584   "punpckldq  %%xmm5,%%xmm1                    \n"
585   "movq       %%xmm6,(%1,%4)                   \n"
586   "lea        (%1,%4,2),%1                     \n"
587   "movdqa     %%xmm1,%%xmm5                    \n"
588   "movq       %%xmm1,(%1)                      \n"
589   "palignr    $0x8,%%xmm5,%%xmm5               \n"
590   "movq       %%xmm5,(%1,%4)                   \n"
591   "lea        (%1,%4,2),%1                     \n"
592   "punpckldq  %%xmm7,%%xmm3                    \n"
593   "movq       %%xmm3,(%1)                      \n"
594   "movdqa     %%xmm3,%%xmm7                    \n"
595   "palignr    $0x8,%%xmm7,%%xmm7               \n"
596   "movq       %%xmm7,(%1,%4)                   \n"
597   "lea        (%1,%4,2),%1                     \n"
598   "punpckldq  %%xmm12,%%xmm8                   \n"
599   "movq       %%xmm8,(%1)                      \n"
600   "movdqa     %%xmm8,%%xmm12                   \n"
601   "palignr    $0x8,%%xmm12,%%xmm12             \n"
602   "movq       %%xmm12,(%1,%4)                  \n"
603   "lea        (%1,%4,2),%1                     \n"
604   "punpckldq  %%xmm14,%%xmm10                  \n"
605   "movdqa     %%xmm10,%%xmm14                  \n"
606   "movq       %%xmm10,(%1)                     \n"
607   "palignr    $0x8,%%xmm14,%%xmm14             \n"
608   "punpckldq  %%xmm13,%%xmm9                   \n"
609   "movq       %%xmm14,(%1,%4)                  \n"
610   "lea        (%1,%4,2),%1                     \n"
611   "movdqa     %%xmm9,%%xmm13                   \n"
612   "movq       %%xmm9,(%1)                      \n"
613   "palignr    $0x8,%%xmm13,%%xmm13             \n"
614   "movq       %%xmm13,(%1,%4)                  \n"
615   "lea        (%1,%4,2),%1                     \n"
616   "punpckldq  %%xmm15,%%xmm11                  \n"
617   "movq       %%xmm11,(%1)                     \n"
618   "movdqa     %%xmm11,%%xmm15                  \n"
619   "palignr    $0x8,%%xmm15,%%xmm15             \n"
620   "sub        $0x10,%2                         \n"
621   "movq       %%xmm15,(%1,%4)                  \n"
622   "lea        (%1,%4,2),%1                     \n"
623   "jg         1b                               \n"
624   : "+r"(src),    // %0
625     "+r"(dst),    // %1
626     "+r"(width)   // %2
627   : "r"(static_cast<intptr_t>(src_stride)),  // %3
628     "r"(static_cast<intptr_t>(dst_stride))   // %4
629   : "memory", "cc",
630     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
631     "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
632 );
633 }
634 
635 #define HAS_TRANSPOSE_UVWX8_SSE2
TransposeUVWx8_SSE2(const uint8 * src,int src_stride,uint8 * dst_a,int dst_stride_a,uint8 * dst_b,int dst_stride_b,int w)636 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
637                                 uint8* dst_a, int dst_stride_a,
638                                 uint8* dst_b, int dst_stride_b,
639                                 int w) {
640   asm volatile (
641   // Read in the data from the source pointer.
642   // First round of bit swap.
643   ".p2align  4                                 \n"
644 "1:                                            \n"
645   "movdqa     (%0),%%xmm0                      \n"
646   "movdqa     (%0,%4),%%xmm1                   \n"
647   "lea        (%0,%4,2),%0                     \n"
648   "movdqa     %%xmm0,%%xmm8                    \n"
649   "punpcklbw  %%xmm1,%%xmm0                    \n"
650   "punpckhbw  %%xmm1,%%xmm8                    \n"
651   "movdqa     %%xmm8,%%xmm1                    \n"
652   "movdqa     (%0),%%xmm2                      \n"
653   "movdqa     (%0,%4),%%xmm3                   \n"
654   "lea        (%0,%4,2),%0                     \n"
655   "movdqa     %%xmm2,%%xmm8                    \n"
656   "punpcklbw  %%xmm3,%%xmm2                    \n"
657   "punpckhbw  %%xmm3,%%xmm8                    \n"
658   "movdqa     %%xmm8,%%xmm3                    \n"
659   "movdqa     (%0),%%xmm4                      \n"
660   "movdqa     (%0,%4),%%xmm5                   \n"
661   "lea        (%0,%4,2),%0                     \n"
662   "movdqa     %%xmm4,%%xmm8                    \n"
663   "punpcklbw  %%xmm5,%%xmm4                    \n"
664   "punpckhbw  %%xmm5,%%xmm8                    \n"
665   "movdqa     %%xmm8,%%xmm5                    \n"
666   "movdqa     (%0),%%xmm6                      \n"
667   "movdqa     (%0,%4),%%xmm7                   \n"
668   "lea        (%0,%4,2),%0                     \n"
669   "movdqa     %%xmm6,%%xmm8                    \n"
670   "punpcklbw  %%xmm7,%%xmm6                    \n"
671   "neg        %4                               \n"
672   "lea        0x10(%0,%4,8),%0                 \n"
673   "punpckhbw  %%xmm7,%%xmm8                    \n"
674   "movdqa     %%xmm8,%%xmm7                    \n"
675   "neg        %4                               \n"
676    // Second round of bit swap.
677   "movdqa     %%xmm0,%%xmm8                    \n"
678   "movdqa     %%xmm1,%%xmm9                    \n"
679   "punpckhwd  %%xmm2,%%xmm8                    \n"
680   "punpckhwd  %%xmm3,%%xmm9                    \n"
681   "punpcklwd  %%xmm2,%%xmm0                    \n"
682   "punpcklwd  %%xmm3,%%xmm1                    \n"
683   "movdqa     %%xmm8,%%xmm2                    \n"
684   "movdqa     %%xmm9,%%xmm3                    \n"
685   "movdqa     %%xmm4,%%xmm8                    \n"
686   "movdqa     %%xmm5,%%xmm9                    \n"
687   "punpckhwd  %%xmm6,%%xmm8                    \n"
688   "punpckhwd  %%xmm7,%%xmm9                    \n"
689   "punpcklwd  %%xmm6,%%xmm4                    \n"
690   "punpcklwd  %%xmm7,%%xmm5                    \n"
691   "movdqa     %%xmm8,%%xmm6                    \n"
692   "movdqa     %%xmm9,%%xmm7                    \n"
693   // Third round of bit swap.
694   // Write to the destination pointer.
695   "movdqa     %%xmm0,%%xmm8                    \n"
696   "punpckldq  %%xmm4,%%xmm0                    \n"
697   "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
698   "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
699   "punpckhdq  %%xmm4,%%xmm8                    \n"
700   "movlpd     %%xmm8,(%1,%5)                   \n"
701   "lea        (%1,%5,2),%1                     \n"
702   "movhpd     %%xmm8,(%2,%6)                   \n"
703   "lea        (%2,%6,2),%2                     \n"
704   "movdqa     %%xmm2,%%xmm8                    \n"
705   "punpckldq  %%xmm6,%%xmm2                    \n"
706   "movlpd     %%xmm2,(%1)                      \n"
707   "movhpd     %%xmm2,(%2)                      \n"
708   "punpckhdq  %%xmm6,%%xmm8                    \n"
709   "movlpd     %%xmm8,(%1,%5)                   \n"
710   "lea        (%1,%5,2),%1                     \n"
711   "movhpd     %%xmm8,(%2,%6)                   \n"
712   "lea        (%2,%6,2),%2                     \n"
713   "movdqa     %%xmm1,%%xmm8                    \n"
714   "punpckldq  %%xmm5,%%xmm1                    \n"
715   "movlpd     %%xmm1,(%1)                      \n"
716   "movhpd     %%xmm1,(%2)                      \n"
717   "punpckhdq  %%xmm5,%%xmm8                    \n"
718   "movlpd     %%xmm8,(%1,%5)                   \n"
719   "lea        (%1,%5,2),%1                     \n"
720   "movhpd     %%xmm8,(%2,%6)                   \n"
721   "lea        (%2,%6,2),%2                     \n"
722   "movdqa     %%xmm3,%%xmm8                    \n"
723   "punpckldq  %%xmm7,%%xmm3                    \n"
724   "movlpd     %%xmm3,(%1)                      \n"
725   "movhpd     %%xmm3,(%2)                      \n"
726   "punpckhdq  %%xmm7,%%xmm8                    \n"
727   "sub        $0x8,%3                          \n"
728   "movlpd     %%xmm8,(%1,%5)                   \n"
729   "lea        (%1,%5,2),%1                     \n"
730   "movhpd     %%xmm8,(%2,%6)                   \n"
731   "lea        (%2,%6,2),%2                     \n"
732   "jg         1b                               \n"
733   : "+r"(src),    // %0
734     "+r"(dst_a),  // %1
735     "+r"(dst_b),  // %2
736     "+r"(w)   // %3
737   : "r"(static_cast<intptr_t>(src_stride)),    // %4
738     "r"(static_cast<intptr_t>(dst_stride_a)),  // %5
739     "r"(static_cast<intptr_t>(dst_stride_b))   // %6
740   : "memory", "cc",
741     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
742     "xmm8", "xmm9"
743 );
744 }
745 #endif
746 #endif
747 
TransposeWx8_C(const uint8 * src,int src_stride,uint8 * dst,int dst_stride,int width)748 static void TransposeWx8_C(const uint8* src, int src_stride,
749                            uint8* dst, int dst_stride,
750                            int width) {
751   for (int i = 0; i < width; ++i) {
752     dst[0] = src[0 * src_stride];
753     dst[1] = src[1 * src_stride];
754     dst[2] = src[2 * src_stride];
755     dst[3] = src[3 * src_stride];
756     dst[4] = src[4 * src_stride];
757     dst[5] = src[5 * src_stride];
758     dst[6] = src[6 * src_stride];
759     dst[7] = src[7 * src_stride];
760     ++src;
761     dst += dst_stride;
762   }
763 }
764 
// Portable transpose of a |width| x |height| block of bytes: the byte at
// source row y, column x is stored at destination row x, column y.
// Used for the rows left over after the 8-row strips have been processed.
static void TransposeWxH_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int width, int height) {
  int col = 0;
  while (col < width) {
    int row = 0;
    while (row < height) {
      dst[col * dst_stride + row] = src[row * src_stride + col];
      ++row;
    }
    ++col;
  }
}
774 
775 LIBYUV_API
TransposePlane(const uint8 * src,int src_stride,uint8 * dst,int dst_stride,int width,int height)776 void TransposePlane(const uint8* src, int src_stride,
777                     uint8* dst, int dst_stride,
778                     int width, int height) {
779   void (*TransposeWx8)(const uint8* src, int src_stride,
780                        uint8* dst, int dst_stride,
781                        int width) = TransposeWx8_C;
782 #if defined(HAS_TRANSPOSE_WX8_NEON)
783   if (TestCpuFlag(kCpuHasNEON)) {
784     TransposeWx8 = TransposeWx8_NEON;
785   }
786 #endif
787 #if defined(HAS_TRANSPOSE_WX8_SSSE3)
788   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
789     TransposeWx8 = TransposeWx8_SSSE3;
790   }
791 #endif
792 #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
793   if (TestCpuFlag(kCpuHasSSSE3) &&
794       IS_ALIGNED(width, 16) &&
795       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
796     TransposeWx8 = TransposeWx8_FAST_SSSE3;
797   }
798 #endif
799 
800   // Work across the source in 8x8 tiles
801   int i = height;
802   while (i >= 8) {
803     TransposeWx8(src, src_stride, dst, dst_stride, width);
804     src += 8 * src_stride;    // Go down 8 rows.
805     dst += 8;                 // Move over 8 columns.
806     i -= 8;
807   }
808 
809   TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
810 }
811 
812 LIBYUV_API
RotatePlane90(const uint8 * src,int src_stride,uint8 * dst,int dst_stride,int width,int height)813 void RotatePlane90(const uint8* src, int src_stride,
814                    uint8* dst, int dst_stride,
815                    int width, int height) {
816   // Rotate by 90 is a transpose with the source read
817   // from bottom to top. So set the source pointer to the end
818   // of the buffer and flip the sign of the source stride.
819   src += src_stride * (height - 1);
820   src_stride = -src_stride;
821   TransposePlane(src, src_stride, dst, dst_stride, width, height);
822 }
823 
824 LIBYUV_API
RotatePlane270(const uint8 * src,int src_stride,uint8 * dst,int dst_stride,int width,int height)825 void RotatePlane270(const uint8* src, int src_stride,
826                     uint8* dst, int dst_stride,
827                     int width, int height) {
828   // Rotate by 270 is a transpose with the destination written
829   // from bottom to top. So set the destination pointer to the end
830   // of the buffer and flip the sign of the destination stride.
831   dst += dst_stride * (width - 1);
832   dst_stride = -dst_stride;
833   TransposePlane(src, src_stride, dst, dst_stride, width, height);
834 }
835 
836 LIBYUV_API
RotatePlane180(const uint8 * src,int src_stride,uint8 * dst,int dst_stride,int width,int height)837 void RotatePlane180(const uint8* src, int src_stride,
838                     uint8* dst, int dst_stride,
839                     int width, int height) {
840   void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
841 #if defined(HAS_MIRRORROW_NEON)
842   if (TestCpuFlag(kCpuHasNEON)) {
843     MirrorRow = MirrorRow_NEON;
844   }
845 #endif
846 #if defined(HAS_MIRRORROW_SSE2)
847   if (TestCpuFlag(kCpuHasSSE2) &&
848       IS_ALIGNED(width, 16) &&
849       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
850       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
851     MirrorRow = MirrorRow_SSE2;
852   }
853 #endif
854 #if defined(HAS_MIRRORROW_SSSE3)
855   if (TestCpuFlag(kCpuHasSSSE3) &&
856       IS_ALIGNED(width, 16) &&
857       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
858       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
859     MirrorRow = MirrorRow_SSSE3;
860   }
861 #endif
862   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
863 #if defined(HAS_COPYROW_NEON)
864   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
865     CopyRow = CopyRow_NEON;
866   }
867 #endif
868 #if defined(HAS_COPYROW_X86)
869   if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
870     CopyRow = CopyRow_X86;
871   }
872 #endif
873 #if defined(HAS_COPYROW_SSE2)
874   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
875       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
876       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
877     CopyRow = CopyRow_SSE2;
878   }
879 #endif
880   if (width > kMaxStride) {
881     return;
882   }
883   // Swap first and last row and mirror the content. Uses a temporary row.
884   SIMD_ALIGNED(uint8 row[kMaxStride]);
885   const uint8* src_bot = src + src_stride * (height - 1);
886   uint8* dst_bot = dst + dst_stride * (height - 1);
887   int half_height = (height + 1) >> 1;
888   // Odd height will harmlessly mirror the middle row twice.
889   for (int y = 0; y < half_height; ++y) {
890     MirrorRow(src, row, width);  // Mirror first row into a buffer
891     src += src_stride;
892     MirrorRow(src_bot, dst, width);  // Mirror last row into first row
893     dst += dst_stride;
894     CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
895     src_bot -= src_stride;
896     dst_bot -= dst_stride;
897   }
898 }
899 
// Portable transpose of an interleaved two-channel strip that is 8 rows tall
// and |width| byte pairs wide. For every pair column, the first byte of each
// of the 8 rows goes to one row of |dst_a| and the second byte to the
// matching row of |dst_b|.
static void TransposeUVWx8_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width) {
  for (int remaining = width; remaining > 0; --remaining) {
    for (int row = 0; row < 8; ++row) {
      dst_a[row] = src[row * src_stride + 0];
      dst_b[row] = src[row * src_stride + 1];
    }
    src += 2;               // Next interleaved pair.
    dst_a += dst_stride_a;  // Next output row in each destination plane.
    dst_b += dst_stride_b;
  }
}
926 
// Portable transpose of an interleaved two-channel block of |width| byte
// pairs by |height| rows. The first byte of the pair at source row y, pair
// column x lands at row x, column y of |dst_a|; the second byte lands at the
// same position in |dst_b|.
static void TransposeUVWxH_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width, int height) {
  for (int x = 0; x < width; ++x) {
    const int pair_offset = x * 2;  // Byte offset of pair x within a row.
    for (int y = 0; y < height; ++y) {
      dst_a[y + (x * dst_stride_a)] = src[pair_offset + (y * src_stride)];
      dst_b[y + (x * dst_stride_b)] = src[pair_offset + (y * src_stride) + 1];
    }
  }
}
937 
938 LIBYUV_API
TransposeUV(const uint8 * src,int src_stride,uint8 * dst_a,int dst_stride_a,uint8 * dst_b,int dst_stride_b,int width,int height)939 void TransposeUV(const uint8* src, int src_stride,
940                  uint8* dst_a, int dst_stride_a,
941                  uint8* dst_b, int dst_stride_b,
942                  int width, int height) {
943   void (*TransposeUVWx8)(const uint8* src, int src_stride,
944                          uint8* dst_a, int dst_stride_a,
945                          uint8* dst_b, int dst_stride_b,
946                          int width) = TransposeUVWx8_C;
947 #if defined(HAS_TRANSPOSE_UVWX8_NEON)
948   if (TestCpuFlag(kCpuHasNEON)) {
949     TransposeUVWx8 = TransposeUVWx8_NEON;
950   }
951 #elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
952   if (TestCpuFlag(kCpuHasSSE2) &&
953       IS_ALIGNED(width, 8) &&
954       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
955     TransposeUVWx8 = TransposeUVWx8_SSE2;
956   }
957 #endif
958 
959   // Work through the source in 8x8 tiles.
960   int i = height;
961   while (i >= 8) {
962     TransposeUVWx8(src, src_stride,
963                    dst_a, dst_stride_a,
964                    dst_b, dst_stride_b,
965                    width);
966     src += 8 * src_stride;    // Go down 8 rows.
967     dst_a += 8;               // Move over 8 columns.
968     dst_b += 8;               // Move over 8 columns.
969     i -= 8;
970   }
971 
972   TransposeUVWxH_C(src, src_stride,
973                    dst_a, dst_stride_a,
974                    dst_b, dst_stride_b,
975                    width, i);
976 }
977 
978 LIBYUV_API
RotateUV90(const uint8 * src,int src_stride,uint8 * dst_a,int dst_stride_a,uint8 * dst_b,int dst_stride_b,int width,int height)979 void RotateUV90(const uint8* src, int src_stride,
980                 uint8* dst_a, int dst_stride_a,
981                 uint8* dst_b, int dst_stride_b,
982                 int width, int height) {
983   src += src_stride * (height - 1);
984   src_stride = -src_stride;
985 
986   TransposeUV(src, src_stride,
987               dst_a, dst_stride_a,
988               dst_b, dst_stride_b,
989               width, height);
990 }
991 
992 LIBYUV_API
RotateUV270(const uint8 * src,int src_stride,uint8 * dst_a,int dst_stride_a,uint8 * dst_b,int dst_stride_b,int width,int height)993 void RotateUV270(const uint8* src, int src_stride,
994                  uint8* dst_a, int dst_stride_a,
995                  uint8* dst_b, int dst_stride_b,
996                  int width, int height) {
997   dst_a += dst_stride_a * (width - 1);
998   dst_b += dst_stride_b * (width - 1);
999   dst_stride_a = -dst_stride_a;
1000   dst_stride_b = -dst_stride_b;
1001 
1002   TransposeUV(src, src_stride,
1003               dst_a, dst_stride_a,
1004               dst_b, dst_stride_b,
1005               width, height);
1006 }
1007 
1008 // Rotate 180 is a horizontal and vertical flip.
1009 LIBYUV_API
RotateUV180(const uint8 * src,int src_stride,uint8 * dst_a,int dst_stride_a,uint8 * dst_b,int dst_stride_b,int width,int height)1010 void RotateUV180(const uint8* src, int src_stride,
1011                  uint8* dst_a, int dst_stride_a,
1012                  uint8* dst_b, int dst_stride_b,
1013                  int width, int height) {
1014   void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
1015       MirrorRowUV_C;
1016 #if defined(HAS_MIRRORROW_UV_NEON)
1017   if (TestCpuFlag(kCpuHasNEON)) {
1018     MirrorRowUV = MirrorRowUV_NEON;
1019   }
1020 #elif defined(HAS_MIRRORROW_UV_SSSE3)
1021   if (TestCpuFlag(kCpuHasSSSE3) &&
1022       IS_ALIGNED(width, 16) &&
1023       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
1024     MirrorRowUV = MirrorRowUV_SSSE3;
1025   }
1026 #endif
1027 
1028   dst_a += dst_stride_a * (height - 1);
1029   dst_b += dst_stride_b * (height - 1);
1030 
1031   for (int i = 0; i < height; ++i) {
1032     MirrorRowUV(src, dst_a, dst_b, width);
1033     src += src_stride;
1034     dst_a -= dst_stride_a;
1035     dst_b -= dst_stride_b;
1036   }
1037 }
1038 
1039 LIBYUV_API
I420Rotate(const uint8 * src_y,int src_stride_y,const uint8 * src_u,int src_stride_u,const uint8 * src_v,int src_stride_v,uint8 * dst_y,int dst_stride_y,uint8 * dst_u,int dst_stride_u,uint8 * dst_v,int dst_stride_v,int width,int height,RotationMode mode)1040 int I420Rotate(const uint8* src_y, int src_stride_y,
1041                const uint8* src_u, int src_stride_u,
1042                const uint8* src_v, int src_stride_v,
1043                uint8* dst_y, int dst_stride_y,
1044                uint8* dst_u, int dst_stride_u,
1045                uint8* dst_v, int dst_stride_v,
1046                int width, int height,
1047                RotationMode mode) {
1048   if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
1049       !dst_y || !dst_u || !dst_v) {
1050     return -1;
1051   }
1052   int halfwidth = (width + 1) >> 1;
1053   int halfheight = (height + 1) >> 1;
1054 
1055   // Negative height means invert the image.
1056   if (height < 0) {
1057     height = -height;
1058     halfheight = (height + 1) >> 1;
1059     src_y = src_y + (height - 1) * src_stride_y;
1060     src_u = src_u + (halfheight - 1) * src_stride_u;
1061     src_v = src_v + (halfheight - 1) * src_stride_v;
1062     src_stride_y = -src_stride_y;
1063     src_stride_u = -src_stride_u;
1064     src_stride_v = -src_stride_v;
1065   }
1066 
1067   switch (mode) {
1068     case kRotate0:
1069       // copy frame
1070       return I420Copy(src_y, src_stride_y,
1071                       src_u, src_stride_u,
1072                       src_v, src_stride_v,
1073                       dst_y, dst_stride_y,
1074                       dst_u, dst_stride_u,
1075                       dst_v, dst_stride_v,
1076                       width, height);
1077     case kRotate90:
1078       RotatePlane90(src_y, src_stride_y,
1079                     dst_y, dst_stride_y,
1080                     width, height);
1081       RotatePlane90(src_u, src_stride_u,
1082                     dst_u, dst_stride_u,
1083                     halfwidth, halfheight);
1084       RotatePlane90(src_v, src_stride_v,
1085                     dst_v, dst_stride_v,
1086                     halfwidth, halfheight);
1087       return 0;
1088     case kRotate270:
1089       RotatePlane270(src_y, src_stride_y,
1090                      dst_y, dst_stride_y,
1091                      width, height);
1092       RotatePlane270(src_u, src_stride_u,
1093                      dst_u, dst_stride_u,
1094                      halfwidth, halfheight);
1095       RotatePlane270(src_v, src_stride_v,
1096                      dst_v, dst_stride_v,
1097                      halfwidth, halfheight);
1098       return 0;
1099     case kRotate180:
1100       RotatePlane180(src_y, src_stride_y,
1101                      dst_y, dst_stride_y,
1102                      width, height);
1103       RotatePlane180(src_u, src_stride_u,
1104                      dst_u, dst_stride_u,
1105                      halfwidth, halfheight);
1106       RotatePlane180(src_v, src_stride_v,
1107                      dst_v, dst_stride_v,
1108                      halfwidth, halfheight);
1109       return 0;
1110     default:
1111       break;
1112   }
1113   return -1;
1114 }
1115 
1116 LIBYUV_API
NV12ToI420Rotate(const uint8 * src_y,int src_stride_y,const uint8 * src_uv,int src_stride_uv,uint8 * dst_y,int dst_stride_y,uint8 * dst_u,int dst_stride_u,uint8 * dst_v,int dst_stride_v,int width,int height,RotationMode mode)1117 int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
1118                      const uint8* src_uv, int src_stride_uv,
1119                      uint8* dst_y, int dst_stride_y,
1120                      uint8* dst_u, int dst_stride_u,
1121                      uint8* dst_v, int dst_stride_v,
1122                      int width, int height,
1123                      RotationMode mode) {
1124   if (!src_y || !src_uv || width <= 0 || height == 0 ||
1125       !dst_y || !dst_u || !dst_v) {
1126     return -1;
1127   }
1128   int halfwidth = (width + 1) >> 1;
1129   int halfheight = (height + 1) >> 1;
1130 
1131   // Negative height means invert the image.
1132   if (height < 0) {
1133     height = -height;
1134     halfheight = (height + 1) >> 1;
1135     src_y = src_y + (height - 1) * src_stride_y;
1136     src_uv = src_uv + (halfheight - 1) * src_stride_uv;
1137     src_stride_y = -src_stride_y;
1138     src_stride_uv = -src_stride_uv;
1139   }
1140 
1141   switch (mode) {
1142     case kRotate0:
1143       // copy frame
1144       return NV12ToI420(src_y, src_stride_y,
1145                         src_uv, src_stride_uv,
1146                         dst_y, dst_stride_y,
1147                         dst_u, dst_stride_u,
1148                         dst_v, dst_stride_v,
1149                         width, height);
1150     case kRotate90:
1151       RotatePlane90(src_y, src_stride_y,
1152                     dst_y, dst_stride_y,
1153                     width, height);
1154       RotateUV90(src_uv, src_stride_uv,
1155                  dst_u, dst_stride_u,
1156                  dst_v, dst_stride_v,
1157                  halfwidth, halfheight);
1158       return 0;
1159     case kRotate270:
1160       RotatePlane270(src_y, src_stride_y,
1161                      dst_y, dst_stride_y,
1162                      width, height);
1163       RotateUV270(src_uv, src_stride_uv,
1164                   dst_u, dst_stride_u,
1165                   dst_v, dst_stride_v,
1166                   halfwidth, halfheight);
1167       return 0;
1168     case kRotate180:
1169       RotatePlane180(src_y, src_stride_y,
1170                      dst_y, dst_stride_y,
1171                      width, height);
1172       RotateUV180(src_uv, src_stride_uv,
1173                   dst_u, dst_stride_u,
1174                   dst_v, dst_stride_v,
1175                   halfwidth, halfheight);
1176       return 0;
1177     default:
1178       break;
1179   }
1180   return -1;
1181 }
1182 
1183 #ifdef __cplusplus
1184 }  // extern "C"
1185 }  // namespace libyuv
1186 #endif
1187