/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
                                          int src_stride,
                                          uint8_t* dst,
                                          int dst_stride,
                                          int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]  // src
    mov       edi, [esp + 12 + 8]  // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap.
    align     4
 convertloop:
    movq      xmm0, qword ptr [eax]
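    // ebp remembers src + 8 so that, once all 8 rows of this 8x8 block
    // have been read, eax can jump straight to the next 8 columns.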
    lea       ebp, [eax + 8]
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
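    // movdqa plus palignr-by-8 on the same register swaps its qwords,
    // placing the high qword of the interleave in the low half where the
    // punpckl* and movq instructions below operate.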
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov       eax, ebp
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    sub       ecx, 8
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}

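// For reference, a scalar sketch of the transpose this routine performs:
// byte (row j, column i) of the width x 8 source block lands at
// (row i, column j) of the destination, so each source column becomes an
// 8-byte destination row. The sketch below is illustrative only and is
// not compiled; libyuv's actual portable fallback is TransposeWx8_C in
// rotate_common.cc.
#if 0
static void TransposeWx8_Sketch(const uint8_t* src, int src_stride,
                                uint8_t* dst, int dst_stride, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {  // Each source column i...
    for (j = 0; j < 8; ++j) {    // ...gathers one byte from each row j...
      dst[i * dst_stride + j] = src[j * src_stride + i];  // ...into a row.
    }
  }
}
#endif
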
__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
                                           int src_stride,
                                           uint8_t* dst_a,
                                           int dst_stride_a,
                                           uint8_t* dst_b,
                                           int dst_stride_b,
                                           int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]  // src
    mov       edi, [esp + 16 + 8]  // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
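    // Save the caller's esp in ecx, then carve out a 16-byte-aligned
    // scratch slot below it for spilling an xmm register; the saved esp
    // is kept at [esp + 16] and restored after the loop. ecx still holds
    // the old esp, so the final argument (w) is loaded through it.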
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w

    align     4
    // Read in the data from the source pointer.
    // First round of bit swap.
 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqu    xmm2, [eax]
    movdqu    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqu    xmm4, [eax]
    movdqu    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqu    xmm6, [eax]
    movdqu    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
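    // Only eight xmm registers exist in 32-bit mode, so xmm5 is spilled
    // to the aligned scratch slot while the last row pair is unpacked.
    // eax has advanced by 8 * src_stride; negating edi lets the lea
    // below step src back up 8 rows and right 16 bytes to the next block
    // of columns before edi is negated back.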
    movdqu    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6  // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
    // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqu    xmm5, [esp]  // restore xmm5
    movdqu    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5  // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6

    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqu    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2  // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1  // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3  // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub       ecx, 8
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop

    mov       esp, [esp + 16]
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}

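// For reference, a scalar sketch of the UV transpose: the source rows
// hold interleaved U/V byte pairs; each pair column is split, with the U
// bytes transposed into dst_a and the V bytes into dst_b. Illustrative
// only and not compiled; libyuv's actual portable fallback is
// TransposeUVWx8_C in rotate_common.cc.
#if 0
static void TransposeUVWx8_Sketch(const uint8_t* src, int src_stride,
                                  uint8_t* dst_a, int dst_stride_a,
                                  uint8_t* dst_b, int dst_stride_b, int w) {
  int i, j;
  for (i = 0; i < w; ++i) {    // Each interleaved U/V column pair...
    for (j = 0; j < 8; ++j) {  // ...spreads its 8 rows across two planes.
      dst_a[i * dst_stride_a + j] = src[j * src_stride + i * 2 + 0];  // U
      dst_b[i * dst_stride_b + j] = src[j * src_stride + i * 2 + 1];  // V
    }
  }
}
#endif
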
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif