/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11 #include "libyuv/row.h"
12 #include "libyuv/rotate_row.h"
13
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18
19 // This module is for Visual C x86.
20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
21 defined(_MSC_VER) && !defined(__clang__)
22
23 __declspec(naked)
TransposeWx8_SSSE3(const uint8 * src,int src_stride,uint8 * dst,int dst_stride,int width)24 void TransposeWx8_SSSE3(const uint8* src, int src_stride,
25 uint8* dst, int dst_stride, int width) {
26 __asm {
27 push edi
28 push esi
29 push ebp
30 mov eax, [esp + 12 + 4] // src
31 mov edi, [esp + 12 + 8] // src_stride
32 mov edx, [esp + 12 + 12] // dst
33 mov esi, [esp + 12 + 16] // dst_stride
34 mov ecx, [esp + 12 + 20] // width
35
36 // Read in the data from the source pointer.
37 // First round of bit swap.
38 align 4
39 convertloop:
40 movq xmm0, qword ptr [eax]
41 lea ebp, [eax + 8]
42 movq xmm1, qword ptr [eax + edi]
43 lea eax, [eax + 2 * edi]
44 punpcklbw xmm0, xmm1
45 movq xmm2, qword ptr [eax]
46 movdqa xmm1, xmm0
47 palignr xmm1, xmm1, 8
48 movq xmm3, qword ptr [eax + edi]
49 lea eax, [eax + 2 * edi]
50 punpcklbw xmm2, xmm3
51 movdqa xmm3, xmm2
52 movq xmm4, qword ptr [eax]
53 palignr xmm3, xmm3, 8
54 movq xmm5, qword ptr [eax + edi]
55 punpcklbw xmm4, xmm5
56 lea eax, [eax + 2 * edi]
57 movdqa xmm5, xmm4
58 movq xmm6, qword ptr [eax]
59 palignr xmm5, xmm5, 8
60 movq xmm7, qword ptr [eax + edi]
61 punpcklbw xmm6, xmm7
62 mov eax, ebp
63 movdqa xmm7, xmm6
64 palignr xmm7, xmm7, 8
65 // Second round of bit swap.
66 punpcklwd xmm0, xmm2
67 punpcklwd xmm1, xmm3
68 movdqa xmm2, xmm0
69 movdqa xmm3, xmm1
70 palignr xmm2, xmm2, 8
71 palignr xmm3, xmm3, 8
72 punpcklwd xmm4, xmm6
73 punpcklwd xmm5, xmm7
74 movdqa xmm6, xmm4
75 movdqa xmm7, xmm5
76 palignr xmm6, xmm6, 8
77 palignr xmm7, xmm7, 8
78 // Third round of bit swap.
79 // Write to the destination pointer.
80 punpckldq xmm0, xmm4
81 movq qword ptr [edx], xmm0
82 movdqa xmm4, xmm0
83 palignr xmm4, xmm4, 8
84 movq qword ptr [edx + esi], xmm4
85 lea edx, [edx + 2 * esi]
86 punpckldq xmm2, xmm6
87 movdqa xmm6, xmm2
88 palignr xmm6, xmm6, 8
89 movq qword ptr [edx], xmm2
90 punpckldq xmm1, xmm5
91 movq qword ptr [edx + esi], xmm6
92 lea edx, [edx + 2 * esi]
93 movdqa xmm5, xmm1
94 movq qword ptr [edx], xmm1
95 palignr xmm5, xmm5, 8
96 punpckldq xmm3, xmm7
97 movq qword ptr [edx + esi], xmm5
98 lea edx, [edx + 2 * esi]
99 movq qword ptr [edx], xmm3
100 movdqa xmm7, xmm3
101 palignr xmm7, xmm7, 8
102 sub ecx, 8
103 movq qword ptr [edx + esi], xmm7
104 lea edx, [edx + 2 * esi]
105 jg convertloop
106
107 pop ebp
108 pop esi
109 pop edi
110 ret
111 }
112 }
113
114 __declspec(naked)
TransposeUVWx8_SSE2(const uint8 * src,int src_stride,uint8 * dst_a,int dst_stride_a,uint8 * dst_b,int dst_stride_b,int w)115 void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
116 uint8* dst_a, int dst_stride_a,
117 uint8* dst_b, int dst_stride_b,
118 int w) {
119 __asm {
120 push ebx
121 push esi
122 push edi
123 push ebp
124 mov eax, [esp + 16 + 4] // src
125 mov edi, [esp + 16 + 8] // src_stride
126 mov edx, [esp + 16 + 12] // dst_a
127 mov esi, [esp + 16 + 16] // dst_stride_a
128 mov ebx, [esp + 16 + 20] // dst_b
129 mov ebp, [esp + 16 + 24] // dst_stride_b
130 mov ecx, esp
131 sub esp, 4 + 16
132 and esp, ~15
133 mov [esp + 16], ecx
134 mov ecx, [ecx + 16 + 28] // w
135
136 align 4
137 convertloop:
138 // Read in the data from the source pointer.
139 // First round of bit swap.
140 movdqu xmm0, [eax]
141 movdqu xmm1, [eax + edi]
142 lea eax, [eax + 2 * edi]
143 movdqa xmm7, xmm0 // use xmm7 as temp register.
144 punpcklbw xmm0, xmm1
145 punpckhbw xmm7, xmm1
146 movdqa xmm1, xmm7
147 movdqu xmm2, [eax]
148 movdqu xmm3, [eax + edi]
149 lea eax, [eax + 2 * edi]
150 movdqa xmm7, xmm2
151 punpcklbw xmm2, xmm3
152 punpckhbw xmm7, xmm3
153 movdqa xmm3, xmm7
154 movdqu xmm4, [eax]
155 movdqu xmm5, [eax + edi]
156 lea eax, [eax + 2 * edi]
157 movdqa xmm7, xmm4
158 punpcklbw xmm4, xmm5
159 punpckhbw xmm7, xmm5
160 movdqa xmm5, xmm7
161 movdqu xmm6, [eax]
162 movdqu xmm7, [eax + edi]
163 lea eax, [eax + 2 * edi]
164 movdqu [esp], xmm5 // backup xmm5
165 neg edi
166 movdqa xmm5, xmm6 // use xmm5 as temp register.
167 punpcklbw xmm6, xmm7
168 punpckhbw xmm5, xmm7
169 movdqa xmm7, xmm5
170 lea eax, [eax + 8 * edi + 16]
171 neg edi
172 // Second round of bit swap.
173 movdqa xmm5, xmm0
174 punpcklwd xmm0, xmm2
175 punpckhwd xmm5, xmm2
176 movdqa xmm2, xmm5
177 movdqa xmm5, xmm1
178 punpcklwd xmm1, xmm3
179 punpckhwd xmm5, xmm3
180 movdqa xmm3, xmm5
181 movdqa xmm5, xmm4
182 punpcklwd xmm4, xmm6
183 punpckhwd xmm5, xmm6
184 movdqa xmm6, xmm5
185 movdqu xmm5, [esp] // restore xmm5
186 movdqu [esp], xmm6 // backup xmm6
187 movdqa xmm6, xmm5 // use xmm6 as temp register.
188 punpcklwd xmm5, xmm7
189 punpckhwd xmm6, xmm7
190 movdqa xmm7, xmm6
191 // Third round of bit swap.
192 // Write to the destination pointer.
193 movdqa xmm6, xmm0
194 punpckldq xmm0, xmm4
195 punpckhdq xmm6, xmm4
196 movdqa xmm4, xmm6
197 movdqu xmm6, [esp] // restore xmm6
198 movlpd qword ptr [edx], xmm0
199 movhpd qword ptr [ebx], xmm0
200 movlpd qword ptr [edx + esi], xmm4
201 lea edx, [edx + 2 * esi]
202 movhpd qword ptr [ebx + ebp], xmm4
203 lea ebx, [ebx + 2 * ebp]
204 movdqa xmm0, xmm2 // use xmm0 as the temp register.
205 punpckldq xmm2, xmm6
206 movlpd qword ptr [edx], xmm2
207 movhpd qword ptr [ebx], xmm2
208 punpckhdq xmm0, xmm6
209 movlpd qword ptr [edx + esi], xmm0
210 lea edx, [edx + 2 * esi]
211 movhpd qword ptr [ebx + ebp], xmm0
212 lea ebx, [ebx + 2 * ebp]
213 movdqa xmm0, xmm1 // use xmm0 as the temp register.
214 punpckldq xmm1, xmm5
215 movlpd qword ptr [edx], xmm1
216 movhpd qword ptr [ebx], xmm1
217 punpckhdq xmm0, xmm5
218 movlpd qword ptr [edx + esi], xmm0
219 lea edx, [edx + 2 * esi]
220 movhpd qword ptr [ebx + ebp], xmm0
221 lea ebx, [ebx + 2 * ebp]
222 movdqa xmm0, xmm3 // use xmm0 as the temp register.
223 punpckldq xmm3, xmm7
224 movlpd qword ptr [edx], xmm3
225 movhpd qword ptr [ebx], xmm3
226 punpckhdq xmm0, xmm7
227 sub ecx, 8
228 movlpd qword ptr [edx + esi], xmm0
229 lea edx, [edx + 2 * esi]
230 movhpd qword ptr [ebx + ebp], xmm0
231 lea ebx, [ebx + 2 * ebp]
232 jg convertloop
233
234 mov esp, [esp + 16]
235 pop ebp
236 pop edi
237 pop esi
238 pop ebx
239 ret
240 }
241 }
242
243 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
244
245 #ifdef __cplusplus
246 } // extern "C"
247 } // namespace libyuv
248 #endif
249