1 /*
2  *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 #include "libyuv/rotate_row.h"
13 
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18 
19 // This module is for Visual C x86.
20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
21     defined(_MSC_VER) && !defined(__clang__)
22 
// Transpose an 8-row block of bytes using SSSE3.
// For each group of 8 source columns: read 8 rows of 8 bytes (rows
// src_stride apart) and write 8 rows of 8 bytes to dst (rows dst_stride
// apart), i.e. dst[x][y] = src[y][x]. width is consumed 8 columns per
// loop pass; assumed to be a multiple of 8 — TODO confirm with callers.
// __declspec(naked): the compiler emits no prologue/epilogue, so the asm
// below saves/restores callee-saved registers and returns itself.
__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride, int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    // 12 = bytes pushed above; first arg is 4 past the return address.
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width (loop counter)

    // Read in the data from the source pointer.
    // First round of bit swap: interleave adjacent rows bytewise.
    align      4
 convertloop:
    movq      xmm0, qword ptr [eax]        // row 0 (8 bytes)
    lea       ebp, [eax + 8]               // ebp = src + 8: start of next column group
    movq      xmm1, qword ptr [eax + edi]  // row 1
    lea       eax, [eax + 2 * edi]         // advance two rows
    punpcklbw xmm0, xmm1                   // interleave rows 0/1
    movq      xmm2, qword ptr [eax]        // row 2
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8                // xmm1 = high half of xmm0 (SSSE3 rotate)
    movq      xmm3, qword ptr [eax + edi]  // row 3
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3                   // interleave rows 2/3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]        // row 4
    palignr   xmm3, xmm3, 8                // xmm3 = high half of xmm2
    movq      xmm5, qword ptr [eax + edi]  // row 5
    punpcklbw xmm4, xmm5                   // interleave rows 4/5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]        // row 6
    palignr   xmm5, xmm5, 8                // xmm5 = high half of xmm4
    movq      xmm7, qword ptr [eax + edi]  // row 7
    punpcklbw xmm6, xmm7                   // interleave rows 6/7
    mov       eax, ebp                     // src advances 8 columns; back to row 0
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8                // xmm7 = high half of xmm6
    // Second round of bit swap: interleave 16-bit pairs.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8                // keep high halves in separate regs
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap: interleave 32-bit pairs.
    // Write to the destination pointer as each output row completes;
    // stores are interleaved with the remaining shuffles to hide latency.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0        // output row 0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4  // output row 1
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2        // output row 2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6  // output row 3
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1        // output row 4
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5  // output row 5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3        // output row 6
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    sub       ecx, 8                       // 8 columns transposed this pass
    movq      qword ptr [edx + esi], xmm7  // output row 7
    lea       edx, [edx + 2 * esi]
    jg        convertloop                  // loop while width remains (> 0)

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}
113 
// Transpose an 8-row block of interleaved UV bytes using SSE2,
// de-interleaving as it goes: reads 8 rows of 16 bytes (8 U/V byte pairs)
// from src, writes the transposed U plane rows to dst_a and the
// transposed V plane rows to dst_b. w is consumed 8 pixels per loop pass;
// assumed to be a multiple of 8 — TODO confirm with callers.
// 32-bit x86 has only 8 XMM registers, so one 16-byte stack slot (at
// [esp]) serves as a spill area; esp is realigned to 16 for it, and the
// original esp is stashed at [esp + 16] for restore.
// __declspec(naked): no compiler prologue/epilogue; all stack handling
// is explicit in the asm below.
__declspec(naked)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    // 16 = bytes pushed above; first arg is 4 past the return address.
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    mov       ecx, esp              // keep pre-alignment esp in ecx
    sub       esp, 4 + 16           // room for saved esp + 16-byte spill slot
    and       esp, ~15              // align the spill slot to 16 bytes
    mov       [esp + 16], ecx       // save original esp above the slot
    mov       ecx, [ecx + 16 + 28]  // w (read via saved esp, since esp moved)

    align      4
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap: interleave adjacent rows bytewise.
    movdqu    xmm0, [eax]           // row 0 (16 bytes, unaligned load)
    movdqu    xmm1, [eax + edi]     // row 1
    lea       eax, [eax + 2 * edi]  // advance two rows
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1            // low halves of rows 0/1
    punpckhbw xmm7, xmm1            // high halves of rows 0/1
    movdqa    xmm1, xmm7
    movdqu    xmm2, [eax]           // row 2
    movdqu    xmm3, [eax + edi]     // row 3
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqu    xmm4, [eax]           // row 4
    movdqu    xmm5, [eax + edi]     // row 5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqu    xmm6, [eax]           // row 6
    movdqu    xmm7, [eax + edi]     // row 7
    lea       eax, [eax + 2 * edi]
    movdqu    [esp], xmm5  // backup xmm5 (to the aligned spill slot)
    neg       edi                   // temporarily negate stride for the rewind below
    movdqa    xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]  // src = src - 8 * src_stride + 16: back to row 0, next 16 bytes
    neg       edi                   // restore positive stride
    // Second round of bit swap: interleave 16-bit pairs.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqu    xmm5, [esp]  // restore xmm5
    movdqu    [esp], xmm6  // backup xmm6 (slot is reused for the next spill)
    movdqa    xmm6, xmm5    // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap: interleave 32-bit pairs.
    // Write to the destination pointers: after the shuffles each register
    // holds a U output row in its low qword and the matching V output row
    // in its high qword, so movlpd feeds dst_a and movhpd feeds dst_b.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqu    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0        // U row -> dst_a
    movhpd    qword ptr [ebx], xmm0        // V row -> dst_b
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]         // advance dst_a two rows
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]         // advance dst_b two rows
    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub       ecx, 8                       // 8 UV pixels transposed this pass
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop                  // loop while w remains (> 0)

    mov       esp, [esp + 16]              // restore the pre-alignment esp
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}
242 
243 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
244 
245 #ifdef __cplusplus
246 }  // extern "C"
247 }  // namespace libyuv
248 #endif
249