1 /*
2  *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/rotate_row.h"
12 #include "libyuv/row.h"
13 
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18 
19 // This module is for 32-bit x86 builds with Visual C++ or clang-cl.
20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
21 
// Transposes a strip of 8 source rows, one 8x8 byte tile per loop pass.
//
// Each pass reads 8 bytes from each of 8 consecutive source rows
// (src + k * src_stride, k = 0..7) and writes 8 destination rows of 8 bytes
// (dst + k * dst_stride), where destination row j holds byte j of every
// source row. After a pass src has advanced by 8 bytes, dst by
// 8 * dst_stride, and width has dropped by 8; the loop repeats while width
// is still above zero. The exit test sits at the bottom of the loop, so one
// full tile is always processed and a width that is not a multiple of 8 is
// effectively rounded up to the next multiple.
//
// The function is declared naked, so the arguments are fetched from the
// stack by hand and edi/esi/ebp are saved and restored explicitly.
// palignr is the SSSE3 instruction this variant depends on.
//
// Register roles inside the loop:
//   eax = source cursor        edi = src_stride
//   edx = destination cursor   esi = dst_stride
//   ecx = remaining width      ebp = source cursor for the next pass
TransposeWx8_SSSE3(const uint8_t * src,int src_stride,uint8_t * dst,int dst_stride,int width)22 __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
23                                           int src_stride,
24                                           uint8_t* dst,
25                                           int dst_stride,
26                                           int width) {
27   __asm {
    // Save the registers overwritten below (other than eax/ecx/edx).
28     push      edi
29     push      esi
30     push      ebp
    // Argument offsets: 12 bytes for the three pushes above, plus 4 bytes to
    // step over the return address.
31     mov       eax, [esp + 12 + 4]  // src
32     mov       edi, [esp + 12 + 8]  // src_stride
33     mov       edx, [esp + 12 + 12]  // dst
34     mov       esi, [esp + 12 + 16]  // dst_stride
35     mov       ecx, [esp + 12 + 20]  // width
36 
37     // Read 8 bytes from each of the 8 source rows.
38     // First round: interleave the bytes of row pairs 0/1, 2/3, 4/5, 6/7.
39     align      4
40  convertloop:
41     movq      xmm0, qword ptr [eax]  // row 0
42     lea       ebp, [eax + 8]  // next pass starts 8 bytes further along row 0
43     movq      xmm1, qword ptr [eax + edi]  // row 1
44     lea       eax, [eax + 2 * edi]  // move down two rows
45     punpcklbw xmm0, xmm1  // xmm0 = rows 0/1 interleaved byte by byte
46     movq      xmm2, qword ptr [eax]  // row 2
47     movdqa    xmm1, xmm0
48     palignr   xmm1, xmm1, 8  // xmm1 = xmm0 with its 8-byte halves swapped
49     movq      xmm3, qword ptr [eax + edi]  // row 3
50     lea       eax, [eax + 2 * edi]
51     punpcklbw xmm2, xmm3  // xmm2 = rows 2/3 interleaved
52     movdqa    xmm3, xmm2
53     movq      xmm4, qword ptr [eax]  // row 4
54     palignr   xmm3, xmm3, 8  // xmm3 = xmm2 with halves swapped
55     movq      xmm5, qword ptr [eax + edi]  // row 5
56     punpcklbw xmm4, xmm5  // xmm4 = rows 4/5 interleaved
57     lea       eax, [eax + 2 * edi]
58     movdqa    xmm5, xmm4
59     movq      xmm6, qword ptr [eax]  // row 6
60     palignr   xmm5, xmm5, 8  // xmm5 = xmm4 with halves swapped
61     movq      xmm7, qword ptr [eax + edi]  // row 7
62     punpcklbw xmm6, xmm7  // xmm6 = rows 6/7 interleaved
63     mov       eax, ebp  // source cursor now points at the next tile
64     movdqa    xmm7, xmm6
65     palignr   xmm7, xmm7, 8  // xmm7 = xmm6 with halves swapped
66     // Second round: interleave 16-bit units so each dword holds one column's
    // bytes from four rows (xmm0..xmm3: rows 0-3, xmm4..xmm7: rows 4-7).
67     punpcklwd xmm0, xmm2
68     punpcklwd xmm1, xmm3
69     movdqa    xmm2, xmm0
70     movdqa    xmm3, xmm1
71     palignr   xmm2, xmm2, 8
72     palignr   xmm3, xmm3, 8
73     punpcklwd xmm4, xmm6
74     punpcklwd xmm5, xmm7
75     movdqa    xmm6, xmm4
76     movdqa    xmm7, xmm5
77     palignr   xmm6, xmm6, 8
78     palignr   xmm7, xmm7, 8
79     // Third round: interleave dwords so each 8-byte half is one complete
80     // source column, then store each half as one destination row.
81     punpckldq xmm0, xmm4
82     movq      qword ptr [edx], xmm0  // destination row 0
83     movdqa    xmm4, xmm0
84     palignr   xmm4, xmm4, 8  // bring the upper half down for the next store
85     movq      qword ptr [edx + esi], xmm4  // destination row 1
86     lea       edx, [edx + 2 * esi]
87     punpckldq xmm2, xmm6
88     movdqa    xmm6, xmm2
89     palignr   xmm6, xmm6, 8
90     movq      qword ptr [edx], xmm2  // destination row 2
91     punpckldq xmm1, xmm5
92     movq      qword ptr [edx + esi], xmm6  // destination row 3
93     lea       edx, [edx + 2 * esi]
94     movdqa    xmm5, xmm1
95     movq      qword ptr [edx], xmm1  // destination row 4
96     palignr   xmm5, xmm5, 8
97     punpckldq xmm3, xmm7
98     movq      qword ptr [edx + esi], xmm5  // destination row 5
99     lea       edx, [edx + 2 * esi]
100     movq      qword ptr [edx], xmm3  // destination row 6
101     movdqa    xmm7, xmm3
102     palignr   xmm7, xmm7, 8
103     sub       ecx, 8  // sets the flags jg tests; the movq/lea below leave them alone
104     movq      qword ptr [edx + esi], xmm7  // destination row 7
105     lea       edx, [edx + 2 * esi]  // dst cursor is now 8 rows further down
106     jg        convertloop  // another pass while width is still > 0
107 
    // Restore the saved registers in reverse push order.
108     pop       ebp
109     pop       esi
110     pop       edi
111     ret
112   }
113 }
114 
// Transposes a strip of 8 source rows of byte pairs and splits the pairs
// into two destinations.
//
// Each loop pass reads 16 bytes from each of 8 consecutive source rows
// (src + k * src_stride, k = 0..7) and writes 8 rows of 8 bytes to dst_a and
// 8 rows of 8 bytes to dst_b. Row j of dst_a holds source byte 2 * j of
// every row and row j of dst_b holds source byte 2 * j + 1 (presumably the U
// and V samples of interleaved UV data, going by the function name).
// After a pass src has advanced by 16 bytes, dst_a by 8 * dst_stride_a,
// dst_b by 8 * dst_stride_b, and w has dropped by 8; the loop repeats while
// w is still above zero. The exit test sits at the bottom of the loop, so
// one pass always runs.
//
// The function is declared naked, so arguments are fetched from the stack by
// hand and ebx/esi/edi/ebp are saved and restored explicitly. All eight XMM
// registers are live at once, so a 16-byte scratch slot on a realigned stack
// is used to spill xmm5 and later xmm6; the entry-time esp is stored just
// above that slot and reloaded before the pops.
//
// Register roles inside the loop:
//   eax = source cursor   edi = src_stride
//   edx = dst_a cursor    esi = dst_stride_a
//   ebx = dst_b cursor    ebp = dst_stride_b
//   ecx = remaining w
TransposeUVWx8_SSE2(const uint8_t * src,int src_stride,uint8_t * dst_a,int dst_stride_a,uint8_t * dst_b,int dst_stride_b,int w)115 __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
116                                            int src_stride,
117                                            uint8_t* dst_a,
118                                            int dst_stride_a,
119                                            uint8_t* dst_b,
120                                            int dst_stride_b,
121                                            int w) {
122   __asm {
123     push      ebx
124     push      esi
125     push      edi
126     push      ebp
    // Argument offsets: 16 bytes for the four pushes above, plus 4 bytes to
    // step over the return address.
127     mov       eax, [esp + 16 + 4]  // src
128     mov       edi, [esp + 16 + 8]  // src_stride
129     mov       edx, [esp + 16 + 12]  // dst_a
130     mov       esi, [esp + 16 + 16]  // dst_stride_a
131     mov       ebx, [esp + 16 + 20]  // dst_b
132     mov       ebp, [esp + 16 + 24]  // dst_stride_b
133     mov       ecx, esp  // keep the current esp so it can be restored later
134     sub       esp, 4 + 16  // room for the saved esp (4) and one XMM spill (16)
135     and       esp, ~15  // round esp down to a 16-byte boundary
136     mov       [esp + 16], ecx  // saved esp lives just above the spill slot
137     mov       ecx, [ecx + 16 + 28]  // w
138 
139     align      4
140     // Read 16 bytes from each of the 8 source rows.
141     // First round: interleave the bytes of row pairs 0/1, 2/3, 4/5, 6/7;
    // the low-half result stays in the even register, the high half goes to
    // the odd one.
142   convertloop:
143     movdqu    xmm0, [eax]  // row 0
144     movdqu    xmm1, [eax + edi]  // row 1
145     lea       eax, [eax + 2 * edi]
146     movdqa    xmm7, xmm0  // use xmm7 as temp register.
147     punpcklbw xmm0, xmm1  // xmm0 = rows 0/1, source bytes 0..7
148     punpckhbw xmm7, xmm1
149     movdqa    xmm1, xmm7  // xmm1 = rows 0/1, source bytes 8..15
150     movdqu    xmm2, [eax]  // row 2
151     movdqu    xmm3, [eax + edi]  // row 3
152     lea       eax, [eax + 2 * edi]
153     movdqa    xmm7, xmm2
154     punpcklbw xmm2, xmm3  // xmm2 = rows 2/3, source bytes 0..7
155     punpckhbw xmm7, xmm3
156     movdqa    xmm3, xmm7  // xmm3 = rows 2/3, source bytes 8..15
157     movdqu    xmm4, [eax]  // row 4
158     movdqu    xmm5, [eax + edi]  // row 5
159     lea       eax, [eax + 2 * edi]
160     movdqa    xmm7, xmm4
161     punpcklbw xmm4, xmm5  // xmm4 = rows 4/5, source bytes 0..7
162     punpckhbw xmm7, xmm5
163     movdqa    xmm5, xmm7  // xmm5 = rows 4/5, source bytes 8..15
164     movdqu    xmm6, [eax]  // row 6
165     movdqu    xmm7, [eax + edi]  // row 7
166     lea       eax, [eax + 2 * edi]  // eax is now 8 rows below the pass start
167     movdqu    [esp], xmm5  // backup xmm5
168     neg       edi  // temporarily -src_stride so the lea below can step back up
169     movdqa    xmm5, xmm6  // use xmm5 as temp register.
170     punpcklbw xmm6, xmm7  // xmm6 = rows 6/7, source bytes 0..7
171     punpckhbw xmm5, xmm7
172     movdqa    xmm7, xmm5  // xmm7 = rows 6/7, source bytes 8..15
173     lea       eax, [eax + 8 * edi + 16]  // back up 8 rows, forward 16 bytes
174     neg       edi  // restore src_stride
175         // Second round: interleave 16-bit units so each dword holds one
        // source byte position from four rows (xmm0..xmm3: rows 0-3,
        // xmm4..xmm7: rows 4-7).
176     movdqa    xmm5, xmm0
177     punpcklwd xmm0, xmm2
178     punpckhwd xmm5, xmm2
179     movdqa    xmm2, xmm5
180     movdqa    xmm5, xmm1
181     punpcklwd xmm1, xmm3
182     punpckhwd xmm5, xmm3
183     movdqa    xmm3, xmm5
184     movdqa    xmm5, xmm4
185     punpcklwd xmm4, xmm6
186     punpckhwd xmm5, xmm6
187     movdqa    xmm6, xmm5
188     movdqu    xmm5, [esp]  // restore xmm5
189     movdqu    [esp], xmm6  // backup xmm6
190     movdqa    xmm6, xmm5  // use xmm6 as temp register.
191     punpcklwd xmm5, xmm7
192     punpckhwd xmm6, xmm7
193     movdqa    xmm7, xmm6
194 
195         // Third round: interleave dwords so each 8-byte half is one source
196         // byte position across all 8 rows. The low half (an even byte
        // position) is stored to dst_a and the high half (the following odd
        // byte position) to dst_b.
197     movdqa    xmm6, xmm0
198     punpckldq xmm0, xmm4
199     punpckhdq xmm6, xmm4
200     movdqa    xmm4, xmm6
201     movdqu    xmm6, [esp]  // restore xmm6
202     movlpd    qword ptr [edx], xmm0  // dst_a row 0
203     movhpd    qword ptr [ebx], xmm0  // dst_b row 0
204     movlpd    qword ptr [edx + esi], xmm4  // dst_a row 1
205     lea       edx, [edx + 2 * esi]
206     movhpd    qword ptr [ebx + ebp], xmm4  // dst_b row 1
207     lea       ebx, [ebx + 2 * ebp]
208     movdqa    xmm0, xmm2  // use xmm0 as the temp register.
209     punpckldq xmm2, xmm6
210     movlpd    qword ptr [edx], xmm2  // dst_a row 2
211     movhpd    qword ptr [ebx], xmm2  // dst_b row 2
212     punpckhdq xmm0, xmm6
213     movlpd    qword ptr [edx + esi], xmm0  // dst_a row 3
214     lea       edx, [edx + 2 * esi]
215     movhpd    qword ptr [ebx + ebp], xmm0  // dst_b row 3
216     lea       ebx, [ebx + 2 * ebp]
217     movdqa    xmm0, xmm1  // use xmm0 as the temp register.
218     punpckldq xmm1, xmm5
219     movlpd    qword ptr [edx], xmm1  // dst_a row 4
220     movhpd    qword ptr [ebx], xmm1  // dst_b row 4
221     punpckhdq xmm0, xmm5
222     movlpd    qword ptr [edx + esi], xmm0  // dst_a row 5
223     lea       edx, [edx + 2 * esi]
224     movhpd    qword ptr [ebx + ebp], xmm0  // dst_b row 5
225     lea       ebx, [ebx + 2 * ebp]
226     movdqa    xmm0, xmm3  // use xmm0 as the temp register.
227     punpckldq xmm3, xmm7
228     movlpd    qword ptr [edx], xmm3  // dst_a row 6
229     movhpd    qword ptr [ebx], xmm3  // dst_b row 6
230     punpckhdq xmm0, xmm7
231     sub       ecx, 8  // sets the flags jg tests; the stores/lea below leave them alone
232     movlpd    qword ptr [edx + esi], xmm0  // dst_a row 7
233     lea       edx, [edx + 2 * esi]
234     movhpd    qword ptr [ebx + ebp], xmm0  // dst_b row 7
235     lea       ebx, [ebx + 2 * ebp]
236     jg        convertloop  // another pass while w is still > 0
237 
238     mov       esp, [esp + 16]  // drop the scratch area: reload the esp saved on entry
239     pop       ebp
240     pop       edi
241     pop       esi
242     pop       ebx
243     ret
244   }
245 }
246 
247 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
248 
249 #ifdef __cplusplus
250 }  // extern "C"
251 }  // namespace libyuv
252 #endif
253