/*
 *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
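// The 8x8 byte transpose below works in three interleave rounds: punpcklbw
// merges byte columns of row pairs, punpcklwd merges the resulting 16-bit
// pairs, and punpckldq produces 64-bit groups that are each one transposed
// output row, written with movq (palignr recovers the high half of each
// register). Each loop iteration consumes 8 source columns (sub $0x8 on
// width) and emits 8 destination rows.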
void TransposeWx8_SSSE3(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst,
                        int dst_stride,
                        int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "movq (%0,%3),%%xmm1 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "movq (%0),%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "palignr $0x8,%%xmm1,%%xmm1 \n"
      "movq (%0,%3),%%xmm3 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm3,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "movq (%0),%%xmm4 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "movq (%0,%3),%%xmm5 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm5,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "movq (%0),%%xmm6 \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "movq (%0,%3),%%xmm7 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm7,%%xmm6 \n"
      "neg %3 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "lea 0x8(%0,%3,8),%0 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "neg %3 \n"
      // Second round of bit swap.
      "punpcklwd %%xmm2,%%xmm0 \n"
      "punpcklwd %%xmm3,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "palignr $0x8,%%xmm2,%%xmm2 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "punpcklwd %%xmm6,%%xmm4 \n"
      "punpcklwd %%xmm7,%%xmm5 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "movdqa %%xmm5,%%xmm7 \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "punpckldq %%xmm4,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "palignr $0x8,%%xmm4,%%xmm4 \n"
      "movq %%xmm4,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm6,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "movq %%xmm2,(%1) \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "punpckldq %%xmm5,%%xmm1 \n"
      "movq %%xmm6,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "movdqa %%xmm1,%%xmm5 \n"
      "movq %%xmm1,(%1) \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "movq %%xmm5,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm7,%%xmm3 \n"
      "movq %%xmm3,(%1) \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "sub $0x8,%2 \n"
      "movq %%xmm7,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "jg 1b \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)
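
// Callers are expected to tile a plane into strips of 8 rows; a minimal
// sketch of that pattern follows. TransposePlane8 is a hypothetical wrapper
// name, not the library's actual dispatch code, and leftover-row handling is
// assumed to live in a scalar C fallback:
//
//   void TransposePlane8(const uint8_t* src, int src_stride,
//                        uint8_t* dst, int dst_stride,
//                        int width, int height) {
//     int i = height;
//     while (i >= 8) {
//       TransposeWx8_SSSE3(src, src_stride, dst, dst_stride, width);
//       src += 8 * src_stride;  // advance 8 source rows
//       dst += 8;               // advance 8 destination columns
//       i -= 8;
//     }
//     // The remaining height % 8 rows would be transposed by scalar code.
//   }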

// Transpose 16x8. 64 bit.
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
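// Same interleave scheme as TransposeWx8_SSSE3 above, but the high halves of
// each 16-byte load are kept in xmm8-xmm15, so 16 source columns are
// transposed per loop iteration (sub $0x10 on width). The extra registers
// are only available on x86-64, hence the 64-bit-only guard.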
void TransposeWx8_Fast_SSSE3(const uint8_t* src,
                             int src_stride,
                             uint8_t* dst,
                             int dst_stride,
                             int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu (%0,%3),%%xmm1 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm0,%%xmm8 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "punpckhbw %%xmm1,%%xmm8 \n"
      "movdqu (%0),%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm8,%%xmm9 \n"
      "palignr $0x8,%%xmm1,%%xmm1 \n"
      "palignr $0x8,%%xmm9,%%xmm9 \n"
      "movdqu (%0,%3),%%xmm3 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm2,%%xmm10 \n"
      "punpcklbw %%xmm3,%%xmm2 \n"
      "punpckhbw %%xmm3,%%xmm10 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "movdqa %%xmm10,%%xmm11 \n"
      "movdqu (%0),%%xmm4 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "palignr $0x8,%%xmm11,%%xmm11 \n"
      "movdqu (%0,%3),%%xmm5 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm4,%%xmm12 \n"
      "punpcklbw %%xmm5,%%xmm4 \n"
      "punpckhbw %%xmm5,%%xmm12 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "movdqa %%xmm12,%%xmm13 \n"
      "movdqu (%0),%%xmm6 \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "palignr $0x8,%%xmm13,%%xmm13 \n"
      "movdqu (%0,%3),%%xmm7 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm6,%%xmm14 \n"
      "punpcklbw %%xmm7,%%xmm6 \n"
      "punpckhbw %%xmm7,%%xmm14 \n"
      "neg %3 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "movdqa %%xmm14,%%xmm15 \n"
      "lea 0x10(%0,%3,8),%0 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "palignr $0x8,%%xmm15,%%xmm15 \n"
      "neg %3 \n"
      // Second round of bit swap.
      "punpcklwd %%xmm2,%%xmm0 \n"
      "punpcklwd %%xmm3,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "palignr $0x8,%%xmm2,%%xmm2 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "punpcklwd %%xmm6,%%xmm4 \n"
      "punpcklwd %%xmm7,%%xmm5 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "movdqa %%xmm5,%%xmm7 \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "punpcklwd %%xmm10,%%xmm8 \n"
      "punpcklwd %%xmm11,%%xmm9 \n"
      "movdqa %%xmm8,%%xmm10 \n"
      "movdqa %%xmm9,%%xmm11 \n"
      "palignr $0x8,%%xmm10,%%xmm10 \n"
      "palignr $0x8,%%xmm11,%%xmm11 \n"
      "punpcklwd %%xmm14,%%xmm12 \n"
      "punpcklwd %%xmm15,%%xmm13 \n"
      "movdqa %%xmm12,%%xmm14 \n"
      "movdqa %%xmm13,%%xmm15 \n"
      "palignr $0x8,%%xmm14,%%xmm14 \n"
      "palignr $0x8,%%xmm15,%%xmm15 \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "punpckldq %%xmm4,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "palignr $0x8,%%xmm4,%%xmm4 \n"
      "movq %%xmm4,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm6,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "movq %%xmm2,(%1) \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "punpckldq %%xmm5,%%xmm1 \n"
      "movq %%xmm6,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "movdqa %%xmm1,%%xmm5 \n"
      "movq %%xmm1,(%1) \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "movq %%xmm5,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm7,%%xmm3 \n"
      "movq %%xmm3,(%1) \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "movq %%xmm7,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm12,%%xmm8 \n"
      "movq %%xmm8,(%1) \n"
      "movdqa %%xmm8,%%xmm12 \n"
      "palignr $0x8,%%xmm12,%%xmm12 \n"
      "movq %%xmm12,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm14,%%xmm10 \n"
      "movdqa %%xmm10,%%xmm14 \n"
      "movq %%xmm10,(%1) \n"
      "palignr $0x8,%%xmm14,%%xmm14 \n"
      "punpckldq %%xmm13,%%xmm9 \n"
      "movq %%xmm14,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "movdqa %%xmm9,%%xmm13 \n"
      "movq %%xmm9,(%1) \n"
      "palignr $0x8,%%xmm13,%%xmm13 \n"
      "movq %%xmm13,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm15,%%xmm11 \n"
      "movq %%xmm11,(%1) \n"
      "movdqa %%xmm11,%%xmm15 \n"
      "palignr $0x8,%%xmm15,%%xmm15 \n"
      "sub $0x10,%2 \n"
      "movq %%xmm15,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "jg 1b \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
        "xmm15");
}
#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)

// Transpose UV 8x8. 64 bit.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
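// Source rows hold interleaved U/V byte pairs. The same interleave rounds
// leave each 16-byte result with a U column in its low 8 bytes and the
// matching V column in its high 8 bytes, so movlpd writes the U plane to
// dst_a and movhpd writes the V plane to dst_b. Width counts UV pairs,
// 8 per loop iteration (sub $0x8).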
void TransposeUVWx8_SSE2(const uint8_t* src,
                         int src_stride,
                         uint8_t* dst_a,
                         int dst_stride_a,
                         uint8_t* dst_b,
                         int dst_stride_b,
                         int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu (%0,%4),%%xmm1 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm0,%%xmm8 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "punpckhbw %%xmm1,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm1 \n"
      "movdqu (%0),%%xmm2 \n"
      "movdqu (%0,%4),%%xmm3 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm2,%%xmm8 \n"
      "punpcklbw %%xmm3,%%xmm2 \n"
      "punpckhbw %%xmm3,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm3 \n"
      "movdqu (%0),%%xmm4 \n"
      "movdqu (%0,%4),%%xmm5 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm4,%%xmm8 \n"
      "punpcklbw %%xmm5,%%xmm4 \n"
      "punpckhbw %%xmm5,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm5 \n"
      "movdqu (%0),%%xmm6 \n"
      "movdqu (%0,%4),%%xmm7 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm6,%%xmm8 \n"
      "punpcklbw %%xmm7,%%xmm6 \n"
      "neg %4 \n"
      "lea 0x10(%0,%4,8),%0 \n"
      "punpckhbw %%xmm7,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm7 \n"
      "neg %4 \n"
      // Second round of bit swap.
      "movdqa %%xmm0,%%xmm8 \n"
      "movdqa %%xmm1,%%xmm9 \n"
      "punpckhwd %%xmm2,%%xmm8 \n"
      "punpckhwd %%xmm3,%%xmm9 \n"
      "punpcklwd %%xmm2,%%xmm0 \n"
      "punpcklwd %%xmm3,%%xmm1 \n"
      "movdqa %%xmm8,%%xmm2 \n"
      "movdqa %%xmm9,%%xmm3 \n"
      "movdqa %%xmm4,%%xmm8 \n"
      "movdqa %%xmm5,%%xmm9 \n"
      "punpckhwd %%xmm6,%%xmm8 \n"
      "punpckhwd %%xmm7,%%xmm9 \n"
      "punpcklwd %%xmm6,%%xmm4 \n"
      "punpcklwd %%xmm7,%%xmm5 \n"
      "movdqa %%xmm8,%%xmm6 \n"
      "movdqa %%xmm9,%%xmm7 \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "movdqa %%xmm0,%%xmm8 \n"
      "punpckldq %%xmm4,%%xmm0 \n"
      "movlpd %%xmm0,(%1) \n"  // Write back U channel
      "movhpd %%xmm0,(%2) \n"  // Write back V channel
      "punpckhdq %%xmm4,%%xmm8 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "movdqa %%xmm2,%%xmm8 \n"
      "punpckldq %%xmm6,%%xmm2 \n"
      "movlpd %%xmm2,(%1) \n"
      "movhpd %%xmm2,(%2) \n"
      "punpckhdq %%xmm6,%%xmm8 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "movdqa %%xmm1,%%xmm8 \n"
      "punpckldq %%xmm5,%%xmm1 \n"
      "movlpd %%xmm1,(%1) \n"
      "movhpd %%xmm1,(%2) \n"
      "punpckhdq %%xmm5,%%xmm8 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "movdqa %%xmm3,%%xmm8 \n"
      "punpckldq %%xmm7,%%xmm3 \n"
      "movlpd %%xmm3,(%1) \n"
      "movhpd %%xmm3,(%2) \n"
      "punpckhdq %%xmm7,%%xmm8 \n"
      "sub $0x8,%3 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "jg 1b \n"
      : "+r"(src),                      // %0
        "+r"(dst_a),                    // %1
        "+r"(dst_b),                    // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride)),    // %4
        "r"((intptr_t)(dst_stride_a)),  // %5
        "r"((intptr_t)(dst_stride_b))   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7", "xmm8", "xmm9");
}
#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif