1 /*
2 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "third_party/libyuv/include/libyuv/scale.h"
12
13 #include <assert.h>
14 #include <string.h>
15
16 #include "third_party/libyuv/include/libyuv/cpu_id.h"
17 #include "third_party/libyuv/source/row.h"
18
19 #ifdef __cplusplus
20 namespace libyuv {
21 extern "C" {
22 #endif
23
24 /*
25  * Note: Defining YUV_DISABLE_ASM allows use of the C version.
26 */
27 //#define YUV_DISABLE_ASM
28
29 #if defined(_MSC_VER)
30 #define ALIGN16(var) __declspec(align(16)) var
31 #else
32 #define ALIGN16(var) var __attribute__((aligned(16)))
33 #endif
34
35 // Note: A Neon reference manual
36 // http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
37 // Note: Some SSE2 reference manuals
38 // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
39
40 // Set the following flag to true to revert to only
41 // using the reference implementation ScalePlaneBox(), and
42 // NOT the optimized versions. Useful for debugging and
43 // when comparing the quality of the resulting YUV planes
44 // as produced by the optimized and non-optimized versions.
45
46 static int use_reference_impl_ = 0;
47
48 void SetUseReferenceImpl(int use) {
49 use_reference_impl_ = use;
50 }
51
52 // ScaleRowDown2Int also used by planar functions
53
54 /**
55 * NEON downscalers with interpolation.
56 *
57 * Provided by Fritz Koenig
58 *
59 */
60
61 #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
62 #define HAS_SCALEROWDOWN2_NEON
63 void ScaleRowDown2_NEON(const uint8* src_ptr, int src_stride,
64 uint8* dst, int dst_width) {
65 asm volatile (
66 "1: \n"
67 "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1
68 "vst1.u8 {q0}, [%1]! \n" // store even pixels
69 "subs %2, %2, #16 \n" // 16 processed per loop
70 "bhi 1b \n"
71 : "+r"(src_ptr), // %0
72 "+r"(dst), // %1
73 "+r"(dst_width) // %2
74 :
75 : "q0", "q1" // Clobber List
76 );
77 }
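
// Illustrative only (not part of the library): a plain C sketch of what the
// vld2/vst1 pair above computes - keep every even source pixel and drop the
// odd one. The function name is hypothetical; src_stride is unused here,
// matching the point-sample variants.
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr, int src_stride,
                                   uint8* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2];  // point sample: even pixels only
  }
}
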
78
79 void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
80 uint8* dst, int dst_width) {
81 asm volatile (
82 "add %1, %0 \n" // change the stride to row 2 pointer
83 "1: \n"
84 "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment
85 "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment
86 "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
87 "vpaddl.u8 q1, q1 \n"
88 "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2
89 "vpadal.u8 q1, q3 \n"
90 "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
91 "vrshrn.u16 d1, q1, #2 \n"
92 "vst1.u8 {q0}, [%2]! \n"
93 "subs %3, %3, #16 \n" // 16 processed per loop
94 "bhi 1b \n"
95 : "+r"(src_ptr), // %0
96 "+r"(src_stride), // %1
97 "+r"(dst), // %2
98 "+r"(dst_width) // %3
99 :
100 : "q0", "q1", "q2", "q3" // Clobber List
101 );
102 }
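
// Illustrative only: the 2x2 box filter that ScaleRowDown2Int_NEON implements,
// written as plain C. Each output pixel is the rounded average of a 2x2 block
// (vpaddl/vpadal sum the four bytes, vrshrn.u16 #2 does the rounded divide).
// The function name is hypothetical.
static void ScaleRowDown2Box_C_Sketch(const uint8* src_ptr, int src_stride,
                                      uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // round to nearest
    s += 2;
    t += 2;
  }
}
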
103
104 #define HAS_SCALEROWDOWN4_NEON
105 static void ScaleRowDown4_NEON(const uint8* src_ptr, int src_stride,
106 uint8* dst_ptr, int dst_width) {
107 asm volatile (
108 "1: \n"
109 "vld2.u8 {d0, d1}, [%0]! \n"
110 "vtrn.u8 d1, d0 \n"
111 "vshrn.u16 d0, q0, #8 \n"
112 "vst1.u32 {d0[1]}, [%1]! \n"
113
114 "subs %2, #4 \n"
115 "bhi 1b \n"
116 : "+r"(src_ptr), // %0
117 "+r"(dst_ptr), // %1
118 "+r"(dst_width) // %2
119 :
120 : "q0", "q1", "memory", "cc"
121 );
122 }
123
124 static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
125 uint8* dst_ptr, int dst_width) {
126 asm volatile (
127 "add r4, %0, %3 \n"
128 "add r5, r4, %3 \n"
129 "add %3, r5, %3 \n"
130 "1: \n"
131 "vld1.u8 {q0}, [%0]! \n" // load up 16x4 block of input data
132 "vld1.u8 {q1}, [r4]! \n"
133 "vld1.u8 {q2}, [r5]! \n"
134 "vld1.u8 {q3}, [%3]! \n"
135
136 "vpaddl.u8 q0, q0 \n"
137 "vpadal.u8 q0, q1 \n"
138 "vpadal.u8 q0, q2 \n"
139 "vpadal.u8 q0, q3 \n"
140
141 "vpaddl.u16 q0, q0 \n"
142
143 "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
144
145 "vmovn.u16 d0, q0 \n"
146 "vst1.u32 {d0[0]}, [%1]! \n"
147
148 "subs %2, #4 \n"
149 "bhi 1b \n"
150
151 : "+r"(src_ptr), // %0
152 "+r"(dst_ptr), // %1
153 "+r"(dst_width) // %2
154 : "r"(src_stride) // %3
155 : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
156 );
157 }
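
// Illustrative only: the 4x4 box filter that ScaleRowDown4Int_NEON implements,
// as plain C. Sixteen source pixels are summed (the vpaddl/vpadal chain) and
// the sum is divided by 16 with rounding (vrshrn.u32 #4). Hypothetical name.
static void ScaleRowDown4Box_C_Sketch(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    unsigned int sum = 0;
    for (j = 0; j < 4; ++j) {
      for (i = 0; i < 4; ++i) {
        sum += src_ptr[x * 4 + i + j * src_stride];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);  // rounded divide by 16
  }
}
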
158
159 #define HAS_SCALEROWDOWN34_NEON
160 // Down scale from 4 to 3 pixels. Use the NEON multilane read/write
161 // to load every 4th pixel into one of 4 different registers.
162 // Point samples 32 pixels to 24 pixels.
163 static void ScaleRowDown34_NEON(const uint8* src_ptr, int src_stride,
164 uint8* dst_ptr, int dst_width) {
165 asm volatile (
166 "1: \n"
167 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
168 "vmov d2, d3 \n" // order needs to be d0, d1, d2
169 "vst3.u8 {d0, d1, d2}, [%1]! \n"
170 "subs %2, #24 \n"
171 "bhi 1b \n"
172 : "+r"(src_ptr), // %0
173 "+r"(dst_ptr), // %1
174 "+r"(dst_width) // %2
175 :
176 : "d0", "d1", "d2", "d3", "memory", "cc"
177 );
178 }
179
180 static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
181 uint8* dst_ptr, int dst_width) {
182 asm volatile (
183 "vmov.u8 d24, #3 \n"
184 "add %3, %0 \n"
185 "1: \n"
186 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
187 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
188
189 // filter src line 0 with src line 1
190 // expand chars to shorts to allow for room
191 // when adding lines together
192 "vmovl.u8 q8, d4 \n"
193 "vmovl.u8 q9, d5 \n"
194 "vmovl.u8 q10, d6 \n"
195 "vmovl.u8 q11, d7 \n"
196
197 // 3 * line_0 + line_1
198 "vmlal.u8 q8, d0, d24 \n"
199 "vmlal.u8 q9, d1, d24 \n"
200 "vmlal.u8 q10, d2, d24 \n"
201 "vmlal.u8 q11, d3, d24 \n"
202
203 // (3 * line_0 + line_1) >> 2
204 "vqrshrn.u16 d0, q8, #2 \n"
205 "vqrshrn.u16 d1, q9, #2 \n"
206 "vqrshrn.u16 d2, q10, #2 \n"
207 "vqrshrn.u16 d3, q11, #2 \n"
208
209 // a0 = (src[0] * 3 + s[1] * 1) >> 2
210 "vmovl.u8 q8, d1 \n"
211 "vmlal.u8 q8, d0, d24 \n"
212 "vqrshrn.u16 d0, q8, #2 \n"
213
214 // a1 = (src[1] * 1 + s[2] * 1) >> 1
215 "vrhadd.u8 d1, d1, d2 \n"
216
217 // a2 = (src[2] * 1 + s[3] * 3) >> 2
218 "vmovl.u8 q8, d2 \n"
219 "vmlal.u8 q8, d3, d24 \n"
220 "vqrshrn.u16 d2, q8, #2 \n"
221
222 "vst3.u8 {d0, d1, d2}, [%1]! \n"
223
224 "subs %2, #24 \n"
225 "bhi 1b \n"
226 : "+r"(src_ptr), // %0
227 "+r"(dst_ptr), // %1
228 "+r"(dst_width), // %2
229 "+r"(src_stride) // %3
230 :
231 : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
232 );
233 }
234
235 static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
236 uint8* dst_ptr, int dst_width) {
237 asm volatile (
238 "vmov.u8 d24, #3 \n"
239 "add %3, %0 \n"
240 "1: \n"
241 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
242 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
243
244 // average src line 0 with src line 1
245 "vrhadd.u8 q0, q0, q2 \n"
246 "vrhadd.u8 q1, q1, q3 \n"
247
248 // a0 = (src[0] * 3 + s[1] * 1) >> 2
249 "vmovl.u8 q3, d1 \n"
250 "vmlal.u8 q3, d0, d24 \n"
251 "vqrshrn.u16 d0, q3, #2 \n"
252
253 // a1 = (src[1] * 1 + s[2] * 1) >> 1
254 "vrhadd.u8 d1, d1, d2 \n"
255
256 // a2 = (src[2] * 1 + s[3] * 3) >> 2
257 "vmovl.u8 q3, d2 \n"
258 "vmlal.u8 q3, d3, d24 \n"
259 "vqrshrn.u16 d2, q3, #2 \n"
260
261 "vst3.u8 {d0, d1, d2}, [%1]! \n"
262
263 "subs %2, #24 \n"
264 "bhi 1b \n"
265 : "+r"(src_ptr), // %0
266 "+r"(dst_ptr), // %1
267 "+r"(dst_width), // %2
268 "+r"(src_stride) // %3
269 :
270 : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
271 );
272 }
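
// Illustrative only: the per-group arithmetic of the 3/4 filters above, in
// plain C for one already row-blended group of 4 source pixels p[0..3]. The
// two NEON variants differ only in how the two source rows are blended
// (3:1 weighting vs. a plain average) before this step. Hypothetical name.
static void ScaleRowDown34Group_C_Sketch(const uint8* p, uint8* out) {
  out[0] = (uint8)((p[0] * 3 + p[1] + 2) >> 2);  // matches vmlal + vqrshrn #2
  out[1] = (uint8)((p[1] + p[2] + 1) >> 1);      // matches vrhadd.u8
  out[2] = (uint8)((p[2] + p[3] * 3 + 2) >> 2);
}
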
273
274 #define HAS_SCALEROWDOWN38_NEON
275 const uint8 shuf38[16] __attribute__ ((aligned(16))) =
276 { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
277 const uint8 shuf38_2[16] __attribute__ ((aligned(16))) =
278 { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
279 const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) =
280 { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
281 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
282 const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) =
283 { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
284 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
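
// Illustrative only: the "multiply by 65536 / n and take the upper 16 bits"
// trick used with these constants, written as plain C. vqrdmulh.s16 doubles
// the product, which is why the table holds 65536 / 12 and 65536 / 18 to get
// effective divides by 6 and 9. Hypothetical helper name.
static uint16 ApproxDiv_Sketch(uint16 v, uint16 reciprocal) {
  // (v * (65536 / n)) >> 16 ~= v / n, with no hardware divide.
  return (uint16)(((unsigned int)v * reciprocal) >> 16);
}
// e.g. ApproxDiv_Sketch(sum, 65536 / 6) approximates sum / 6 for a 3x2 box.
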
285
286 // 32 -> 12
287 static void ScaleRowDown38_NEON(const uint8* src_ptr, int src_stride,
288 uint8* dst_ptr, int dst_width) {
289 asm volatile (
290 "vld1.u8 {q3}, [%3] \n"
291 "1: \n"
292 "vld1.u8 {d0, d1, d2, d3}, [%0]! \n"
293 "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
294 "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
295 "vst1.u8 {d4}, [%1]! \n"
296 "vst1.u32 {d5[0]}, [%1]! \n"
297 "subs %2, #12 \n"
298 "bhi 1b \n"
299 : "+r"(src_ptr), // %0
300 "+r"(dst_ptr), // %1
301 "+r"(dst_width) // %2
302 : "r"(shuf38) // %3
303 : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
304 );
305 }
306
307 // 32x3 -> 12x1
308 static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
309 uint8* dst_ptr, int dst_width) {
310 asm volatile (
311 "vld1.u16 {q13}, [%4] \n"
312 "vld1.u8 {q14}, [%5] \n"
313 "vld1.u8 {q15}, [%6] \n"
314 "add r4, %0, %3, lsl #1 \n"
315 "add %3, %0 \n"
316 "1: \n"
317
318 // d0 = 00 40 01 41 02 42 03 43
319 // d1 = 10 50 11 51 12 52 13 53
320 // d2 = 20 60 21 61 22 62 23 63
321 // d3 = 30 70 31 71 32 72 33 73
322 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
323 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
324 "vld4.u8 {d16, d17, d18, d19}, [r4]! \n"
325
326 // Shuffle the input data around to align the data
327 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
328 // d0 = 00 10 01 11 02 12 03 13
329 // d1 = 40 50 41 51 42 52 43 53
330 "vtrn.u8 d0, d1 \n"
331 "vtrn.u8 d4, d5 \n"
332 "vtrn.u8 d16, d17 \n"
333
334 // d2 = 20 30 21 31 22 32 23 33
335 // d3 = 60 70 61 71 62 72 63 73
336 "vtrn.u8 d2, d3 \n"
337 "vtrn.u8 d6, d7 \n"
338 "vtrn.u8 d18, d19 \n"
339
340 // d0 = 00+10 01+11 02+12 03+13
341 // d2 = 40+50 41+51 42+52 43+53
342 "vpaddl.u8 q0, q0 \n"
343 "vpaddl.u8 q2, q2 \n"
344 "vpaddl.u8 q8, q8 \n"
345
346 // d3 = 60+70 61+71 62+72 63+73
347 "vpaddl.u8 d3, d3 \n"
348 "vpaddl.u8 d7, d7 \n"
349 "vpaddl.u8 d19, d19 \n"
350
351 // combine source lines
352 "vadd.u16 q0, q2 \n"
353 "vadd.u16 q0, q8 \n"
354 "vadd.u16 d4, d3, d7 \n"
355 "vadd.u16 d4, d19 \n"
356
357 // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
358 // + s[6 + st * 1] + s[7 + st * 1]
359 // + s[6 + st * 2] + s[7 + st * 2]) / 6
360 "vqrdmulh.s16 q2, q13 \n"
361 "vmovn.u16 d4, q2 \n"
362
363 // Shuffle 2,3 reg around so that 2 can be added to the
364 // 0,1 reg and 3 can be added to the 4,5 reg. This
365 // requires expanding from u8 to u16 as the 0,1 and 4,5
366 // registers are already expanded. Then do transposes
367 // to get aligned.
368 // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
369 "vmovl.u8 q1, d2 \n"
370 "vmovl.u8 q3, d6 \n"
371 "vmovl.u8 q9, d18 \n"
372
373 // combine source lines
374 "vadd.u16 q1, q3 \n"
375 "vadd.u16 q1, q9 \n"
376
377 // d4 = xx 20 xx 30 xx 22 xx 32
378 // d5 = xx 21 xx 31 xx 23 xx 33
379 "vtrn.u32 d2, d3 \n"
380
381 // d4 = xx 20 xx 21 xx 22 xx 23
382 // d5 = xx 30 xx 31 xx 32 xx 33
383 "vtrn.u16 d2, d3 \n"
384
385 // 0+1+2, 3+4+5
386 "vadd.u16 q0, q1 \n"
387
388 // Need to divide, but can't downshift as the value
389 // isn't a power of 2. So multiply by 65536 / n
390 // and take the upper 16 bits.
391 "vqrdmulh.s16 q0, q15 \n"
392
393 // Align for table lookup, vtbl requires registers to
394 // be adjacent
395 "vmov.u8 d2, d4 \n"
396
397 "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
398 "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
399
400 "vst1.u8 {d3}, [%1]! \n"
401 "vst1.u32 {d4[0]}, [%1]! \n"
402 "subs %2, #12 \n"
403 "bhi 1b \n"
404 : "+r"(src_ptr), // %0
405 "+r"(dst_ptr), // %1
406 "+r"(dst_width), // %2
407 "+r"(src_stride) // %3
408 : "r"(mult38_div6), // %4
409 "r"(shuf38_2), // %5
410 "r"(mult38_div9) // %6
411 : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
412 "q13", "q14", "q15", "memory", "cc"
413 );
414 }
415
416 // 32x2 -> 12x1
417 static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
418 uint8* dst_ptr, int dst_width) {
419 asm volatile (
420 "vld1.u16 {q13}, [%4] \n"
421 "vld1.u8 {q14}, [%5] \n"
422 "add %3, %0 \n"
423 "1: \n"
424
425 // d0 = 00 40 01 41 02 42 03 43
426 // d1 = 10 50 11 51 12 52 13 53
427 // d2 = 20 60 21 61 22 62 23 63
428 // d3 = 30 70 31 71 32 72 33 73
429 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n"
430 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n"
431
432 // Shuffle the input data around to align the data
433 // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
434 // d0 = 00 10 01 11 02 12 03 13
435 // d1 = 40 50 41 51 42 52 43 53
436 "vtrn.u8 d0, d1 \n"
437 "vtrn.u8 d4, d5 \n"
438
439 // d2 = 20 30 21 31 22 32 23 33
440 // d3 = 60 70 61 71 62 72 63 73
441 "vtrn.u8 d2, d3 \n"
442 "vtrn.u8 d6, d7 \n"
443
444 // d0 = 00+10 01+11 02+12 03+13
445 // d2 = 40+50 41+51 42+52 43+53
446 "vpaddl.u8 q0, q0 \n"
447 "vpaddl.u8 q2, q2 \n"
448
449 // d3 = 60+70 61+71 62+72 63+73
450 "vpaddl.u8 d3, d3 \n"
451 "vpaddl.u8 d7, d7 \n"
452
453 // combine source lines
454 "vadd.u16 q0, q2 \n"
455 "vadd.u16 d4, d3, d7 \n"
456
457 // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
458 "vqrshrn.u16 d4, q2, #2 \n"
459
460 // Shuffle 2,3 reg around so that 2 can be added to the
461 // 0,1 reg and 3 can be added to the 4,5 reg. This
462 // requires expanding from u8 to u16 as the 0,1 and 4,5
463 // registers are already expanded. Then do transposes
464 // to get aligned.
465 // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
466 "vmovl.u8 q1, d2 \n"
467 "vmovl.u8 q3, d6 \n"
468
469 // combine source lines
470 "vadd.u16 q1, q3 \n"
471
472 // d4 = xx 20 xx 30 xx 22 xx 32
473 // d5 = xx 21 xx 31 xx 23 xx 33
474 "vtrn.u32 d2, d3 \n"
475
476 // d4 = xx 20 xx 21 xx 22 xx 23
477 // d5 = xx 30 xx 31 xx 32 xx 33
478 "vtrn.u16 d2, d3 \n"
479
480 // 0+1+2, 3+4+5
481 "vadd.u16 q0, q1 \n"
482
483 // Need to divide, but can't downshift as the value
484 // isn't a power of 2. So multiply by 65536 / n
485 // and take the upper 16 bits.
486 "vqrdmulh.s16 q0, q13 \n"
487
488 // Align for table lookup, vtbl requires registers to
489 // be adjacent
490 "vmov.u8 d2, d4 \n"
491
492 "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
493 "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
494
495 "vst1.u8 {d3}, [%1]! \n"
496 "vst1.u32 {d4[0]}, [%1]! \n"
497 "subs %2, #12 \n"
498 "bhi 1b \n"
499 : "+r"(src_ptr), // %0
500 "+r"(dst_ptr), // %1
501 "+r"(dst_width), // %2
502 "+r"(src_stride) // %3
503 : "r"(mult38_div6), // %4
504 "r"(shuf38_2) // %5
505 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
506 );
507 }
508
509 /**
510 * SSE2 downscalers with interpolation.
511 *
512 * Provided by Frank Barchard (fbarchard@google.com)
513 *
514 */
515
516 // Constants for SSE2 code
517 #elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \
518 !defined(YUV_DISABLE_ASM)
519 #if defined(_MSC_VER)
520 #define TALIGN16(t, var) __declspec(align(16)) t _ ## var
521 #elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
522 #define TALIGN16(t, var) t var __attribute__((aligned(16)))
523 #else
524 #define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
525 #endif
526
527 #if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \
528 defined(__i386__)
529 #define DECLARE_FUNCTION(name) \
530 ".text \n" \
531 ".globl _" #name " \n" \
532 "_" #name ": \n"
533 #else
534 #define DECLARE_FUNCTION(name) \
535 ".text \n" \
536 ".global " #name " \n" \
537 #name ": \n"
538 #endif
539
540
541 // Offsets for source bytes 0 to 9
542 //extern "C"
543 TALIGN16(const uint8, shuf0[16]) =
544 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
545
546 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
547 //extern "C"
548 TALIGN16(const uint8, shuf1[16]) =
549 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
550
551 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
552 //extern "C"
553 TALIGN16(const uint8, shuf2[16]) =
554 { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
555
556 // Offsets for source bytes 0 to 10
557 //extern "C"
558 TALIGN16(const uint8, shuf01[16]) =
559 { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
560
561 // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
562 //extern "C"
563 TALIGN16(const uint8, shuf11[16]) =
564 { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
565
566 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
567 //extern "C"
568 TALIGN16(const uint8, shuf21[16]) =
569 { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
570
571 // Coefficients for source bytes 0 to 10
572 //extern "C"
573 TALIGN16(const uint8, madd01[16]) =
574 { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
575
576 // Coefficients for source bytes 10 to 21
577 //extern "C"
578 TALIGN16(const uint8, madd11[16]) =
579 { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
580
581 // Coefficients for source bytes 21 to 31
582 //extern "C"
583 TALIGN16(const uint8, madd21[16]) =
584 { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
585
586 // Rounding constant for the 3/4 filters (added before the >> 2)
587 //extern "C"
588 TALIGN16(const int16, round34[8]) =
589 { 2, 2, 2, 2, 2, 2, 2, 2 };
590
591 //extern "C"
592 TALIGN16(const uint8, shuf38a[16]) =
593 { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
594
595 //extern "C"
596 TALIGN16(const uint8, shuf38b[16]) =
597 { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
598
599 // Arrange words 0,3,6 into 0,1,2
600 //extern "C"
601 TALIGN16(const uint8, shufac0[16]) =
602 { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
603
604 // Arrange words 0,3,6 into 3,4,5
605 //extern "C"
606 TALIGN16(const uint8, shufac3[16]) =
607 { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
608
609 // Scaling values for boxes of 3x3 and 2x3
610 //extern "C"
611 TALIGN16(const uint16, scaleac3[8]) =
612 { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
613
614 // Arrange first value for pixels 0,1,2,3,4,5
615 //extern "C"
616 TALIGN16(const uint8, shufab0[16]) =
617 { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
618
619 // Arrange second value for pixels 0,1,2,3,4,5
620 //extern "C"
621 TALIGN16(const uint8, shufab1[16]) =
622 { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
623
624 // Arrange third value for pixels 0,1,2,3,4,5
625 //extern "C"
626 TALIGN16(const uint8, shufab2[16]) =
627 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
628
629 // Scaling values for boxes of 3x2 and 2x2
630 //extern "C"
631 TALIGN16(const uint16, scaleab2[8]) =
632 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
633 #endif
634
635 #if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) && defined(_MSC_VER)
636
637 #define HAS_SCALEROWDOWN2_SSE2
638 // Reads 32 pixels, throws half away and writes 16 pixels.
639 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
640 __declspec(naked)
641 static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
642 uint8* dst_ptr, int dst_width) {
643 __asm {
644 mov eax, [esp + 4] // src_ptr
645 // src_stride ignored
646 mov edx, [esp + 12] // dst_ptr
647 mov ecx, [esp + 16] // dst_width
648 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
649 psrlw xmm5, 8
650
651 wloop:
652 movdqa xmm0, [eax]
653 movdqa xmm1, [eax + 16]
654 lea eax, [eax + 32]
655 pand xmm0, xmm5
656 pand xmm1, xmm5
657 packuswb xmm0, xmm1
658 movdqa [edx], xmm0
659 lea edx, [edx + 16]
660 sub ecx, 16
661 ja wloop
662
663 ret
664 }
665 }
666 // Blends 32x2 rectangle to 16x1.
667 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
668 __declspec(naked)
669 void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
670 uint8* dst_ptr, int dst_width) {
671 __asm {
672 push esi
673 mov eax, [esp + 4 + 4] // src_ptr
674 mov esi, [esp + 4 + 8] // src_stride
675 mov edx, [esp + 4 + 12] // dst_ptr
676 mov ecx, [esp + 4 + 16] // dst_width
677 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
678 psrlw xmm5, 8
679
680 wloop:
681 movdqa xmm0, [eax]
682 movdqa xmm1, [eax + 16]
683 movdqa xmm2, [eax + esi]
684 movdqa xmm3, [eax + esi + 16]
685 lea eax, [eax + 32]
686 pavgb xmm0, xmm2 // average rows
687 pavgb xmm1, xmm3
688
689 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
690 psrlw xmm0, 8
691 movdqa xmm3, xmm1
692 psrlw xmm1, 8
693 pand xmm2, xmm5
694 pand xmm3, xmm5
695 pavgw xmm0, xmm2
696 pavgw xmm1, xmm3
697 packuswb xmm0, xmm1
698
699 movdqa [edx], xmm0
700 lea edx, [edx + 16]
701 sub ecx, 16
702 ja wloop
703
704 pop esi
705 ret
706 }
707 }
708
709 #define HAS_SCALEROWDOWN4_SSE2
710 // Point samples 32 pixels to 8 pixels.
711 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
712 __declspec(naked)
713 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
714 uint8* dst_ptr, int dst_width) {
715 __asm {
716 pushad
717 mov esi, [esp + 32 + 4] // src_ptr
718 // src_stride ignored
719 mov edi, [esp + 32 + 12] // dst_ptr
720 mov ecx, [esp + 32 + 16] // dst_width
721 pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
722 psrld xmm5, 24
723
724 wloop:
725 movdqa xmm0, [esi]
726 movdqa xmm1, [esi + 16]
727 lea esi, [esi + 32]
728 pand xmm0, xmm5
729 pand xmm1, xmm5
730 packuswb xmm0, xmm1
731 packuswb xmm0, xmm0
732 movq qword ptr [edi], xmm0
733 lea edi, [edi + 8]
734 sub ecx, 8
735 ja wloop
736
737 popad
738 ret
739 }
740 }
741
742 // Blends 32x4 rectangle to 8x1.
743 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
744 __declspec(naked)
745 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
746 uint8* dst_ptr, int dst_width) {
747 __asm {
748 pushad
749 mov esi, [esp + 32 + 4] // src_ptr
750 mov ebx, [esp + 32 + 8] // src_stride
751 mov edi, [esp + 32 + 12] // dst_ptr
752 mov ecx, [esp + 32 + 16] // dst_width
753 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
754 psrlw xmm7, 8
755 lea edx, [ebx + ebx * 2] // src_stride * 3
756
757 wloop:
758 movdqa xmm0, [esi]
759 movdqa xmm1, [esi + 16]
760 movdqa xmm2, [esi + ebx]
761 movdqa xmm3, [esi + ebx + 16]
762 pavgb xmm0, xmm2 // average rows
763 pavgb xmm1, xmm3
764 movdqa xmm2, [esi + ebx * 2]
765 movdqa xmm3, [esi + ebx * 2 + 16]
766 movdqa xmm4, [esi + edx]
767 movdqa xmm5, [esi + edx + 16]
768 lea esi, [esi + 32]
769 pavgb xmm2, xmm4
770 pavgb xmm3, xmm5
771 pavgb xmm0, xmm2
772 pavgb xmm1, xmm3
773
774 movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
775 psrlw xmm0, 8
776 movdqa xmm3, xmm1
777 psrlw xmm1, 8
778 pand xmm2, xmm7
779 pand xmm3, xmm7
780 pavgw xmm0, xmm2
781 pavgw xmm1, xmm3
782 packuswb xmm0, xmm1
783
784 movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
785 psrlw xmm0, 8
786 pand xmm2, xmm7
787 pavgw xmm0, xmm2
788 packuswb xmm0, xmm0
789
790 movq qword ptr [edi], xmm0
791 lea edi, [edi + 8]
792 sub ecx, 8
793 ja wloop
794
795 popad
796 ret
797 }
798 }
799
800 #define HAS_SCALEROWDOWN8_SSE2
801 // Point samples 32 pixels to 4 pixels.
802 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
803 __declspec(naked)
804 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
805 uint8* dst_ptr, int dst_width) {
806 __asm {
807 pushad
808 mov esi, [esp + 32 + 4] // src_ptr
809 // src_stride ignored
810 mov edi, [esp + 32 + 12] // dst_ptr
811 mov ecx, [esp + 32 + 16] // dst_width
812 pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes
813 psrlq xmm5, 56
814
815 wloop:
816 movdqa xmm0, [esi]
817 movdqa xmm1, [esi + 16]
818 lea esi, [esi + 32]
819 pand xmm0, xmm5
820 pand xmm1, xmm5
821 packuswb xmm0, xmm1 // 32->16
822 packuswb xmm0, xmm0 // 16->8
823 packuswb xmm0, xmm0 // 8->4
824 movd dword ptr [edi], xmm0
825 lea edi, [edi + 4]
826 sub ecx, 4
827 ja wloop
828
829 popad
830 ret
831 }
832 }
833
834 // Blends 32x8 rectangle to 4x1.
835 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
836 __declspec(naked)
837 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
838 uint8* dst_ptr, int dst_width) {
839 __asm {
840 pushad
841 mov esi, [esp + 32 + 4] // src_ptr
842 mov ebx, [esp + 32 + 8] // src_stride
843 mov edi, [esp + 32 + 12] // dst_ptr
844 mov ecx, [esp + 32 + 16] // dst_width
845 lea edx, [ebx + ebx * 2] // src_stride * 3
846 pxor xmm7, xmm7
847
848 wloop:
849 movdqa xmm0, [esi] // average 8 rows to 1
850 movdqa xmm1, [esi + 16]
851 movdqa xmm2, [esi + ebx]
852 movdqa xmm3, [esi + ebx + 16]
853 pavgb xmm0, xmm2
854 pavgb xmm1, xmm3
855 movdqa xmm2, [esi + ebx * 2]
856 movdqa xmm3, [esi + ebx * 2 + 16]
857 movdqa xmm4, [esi + edx]
858 movdqa xmm5, [esi + edx + 16]
859 lea ebp, [esi + ebx * 4]
860 lea esi, [esi + 32]
861 pavgb xmm2, xmm4
862 pavgb xmm3, xmm5
863 pavgb xmm0, xmm2
864 pavgb xmm1, xmm3
865
866 movdqa xmm2, [ebp]
867 movdqa xmm3, [ebp + 16]
868 movdqa xmm4, [ebp + ebx]
869 movdqa xmm5, [ebp + ebx + 16]
870 pavgb xmm2, xmm4
871 pavgb xmm3, xmm5
872 movdqa xmm4, [ebp + ebx * 2]
873 movdqa xmm5, [ebp + ebx * 2 + 16]
874 movdqa xmm6, [ebp + edx]
875 pavgb xmm4, xmm6
876 movdqa xmm6, [ebp + edx + 16]
877 pavgb xmm5, xmm6
878 pavgb xmm2, xmm4
879 pavgb xmm3, xmm5
880 pavgb xmm0, xmm2
881 pavgb xmm1, xmm3
882
883 psadbw xmm0, xmm7 // average 32 pixels to 4
884 psadbw xmm1, xmm7
885 pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01
886 pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx
887 por xmm0, xmm1 // -> 3201
888 psrlw xmm0, 3
889 packuswb xmm0, xmm0
890 packuswb xmm0, xmm0
891 movd dword ptr [edi], xmm0
892
893 lea edi, [edi + 4]
894 sub ecx, 4
895 ja wloop
896
897 popad
898 ret
899 }
900 }
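
// Illustrative only: an exact 8x8 box reduction in plain C for comparison.
// The SSE2 code above approximates this by cascading pavgb (pairwise rounded
// averages) across the 8 rows and then using psadbw + psrlw 3 to average 8
// horizontal pixels, which is faster but rounds at each stage. Hypothetical name.
static void ScaleRowDown8Box_C_Sketch(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    unsigned int sum = 0;
    for (j = 0; j < 8; ++j) {
      for (i = 0; i < 8; ++i) {
        sum += src_ptr[x * 8 + i + j * src_stride];
      }
    }
    dst_ptr[x] = (uint8)((sum + 32) >> 6);  // rounded divide by 64
  }
}
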
901
902 #define HAS_SCALEROWDOWN34_SSSE3
903 // Point samples 32 pixels to 24 pixels.
904 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
905 // Then shuffled to do the scaling.
906
907 // Note that movdqa+palign may be better than movdqu.
908 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
909 __declspec(naked)
910 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
911 uint8* dst_ptr, int dst_width) {
912 __asm {
913 pushad
914 mov esi, [esp + 32 + 4] // src_ptr
915 // src_stride ignored
916 mov edi, [esp + 32 + 12] // dst_ptr
917 mov ecx, [esp + 32 + 16] // dst_width
918 movdqa xmm3, _shuf0
919 movdqa xmm4, _shuf1
920 movdqa xmm5, _shuf2
921
922 wloop:
923 movdqa xmm0, [esi]
924 movdqa xmm1, [esi + 16]
925 lea esi, [esi + 32]
926 movdqa xmm2, xmm1
927 palignr xmm1, xmm0, 8
928 pshufb xmm0, xmm3
929 pshufb xmm1, xmm4
930 pshufb xmm2, xmm5
931 movq qword ptr [edi], xmm0
932 movq qword ptr [edi + 8], xmm1
933 movq qword ptr [edi + 16], xmm2
934 lea edi, [edi + 24]
935 sub ecx, 24
936 ja wloop
937
938 popad
939 ret
940 }
941 }
942
943 // Blends 32x2 rectangle to 24x1
944 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
945 // Then shuffled to do the scaling.
946
947 // Register usage:
948 // xmm0 src_row 0
949 // xmm1 src_row 1
950 // xmm2 shuf 0
951 // xmm3 shuf 1
952 // xmm4 shuf 2
953 // xmm5 madd 0
954 // xmm6 madd 1
955 // xmm7 round34
956
957 // Note that movdqa+palign may be better than movdqu.
958 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
959 __declspec(naked)
960 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
961 uint8* dst_ptr, int dst_width) {
962 __asm {
963 pushad
964 mov esi, [esp + 32 + 4] // src_ptr
965 mov ebx, [esp + 32 + 8] // src_stride
966 mov edi, [esp + 32 + 12] // dst_ptr
967 mov ecx, [esp + 32 + 16] // dst_width
968 movdqa xmm2, _shuf01
969 movdqa xmm3, _shuf11
970 movdqa xmm4, _shuf21
971 movdqa xmm5, _madd01
972 movdqa xmm6, _madd11
973 movdqa xmm7, _round34
974
975 wloop:
976 movdqa xmm0, [esi] // pixels 0..7
977 movdqa xmm1, [esi+ebx]
978 pavgb xmm0, xmm1
979 pshufb xmm0, xmm2
980 pmaddubsw xmm0, xmm5
981 paddsw xmm0, xmm7
982 psrlw xmm0, 2
983 packuswb xmm0, xmm0
984 movq qword ptr [edi], xmm0
985 movdqu xmm0, [esi+8] // pixels 8..15
986 movdqu xmm1, [esi+ebx+8]
987 pavgb xmm0, xmm1
988 pshufb xmm0, xmm3
989 pmaddubsw xmm0, xmm6
990 paddsw xmm0, xmm7
991 psrlw xmm0, 2
992 packuswb xmm0, xmm0
993 movq qword ptr [edi+8], xmm0
994 movdqa xmm0, [esi+16] // pixels 16..23
995 movdqa xmm1, [esi+ebx+16]
996 lea esi, [esi+32]
997 pavgb xmm0, xmm1
998 pshufb xmm0, xmm4
999 movdqa xmm1, _madd21
1000 pmaddubsw xmm0, xmm1
1001 paddsw xmm0, xmm7
1002 psrlw xmm0, 2
1003 packuswb xmm0, xmm0
1004 movq qword ptr [edi+16], xmm0
1005 lea edi, [edi+24]
1006 sub ecx, 24
1007 ja wloop
1008
1009 popad
1010 ret
1011 }
1012 }
1013
1014 // Note that movdqa+palign may be better than movdqu.
1015 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
1016 __declspec(naked)
1017 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
1018 uint8* dst_ptr, int dst_width) {
1019 __asm {
1020 pushad
1021 mov esi, [esp + 32 + 4] // src_ptr
1022 mov ebx, [esp + 32 + 8] // src_stride
1023 mov edi, [esp + 32 + 12] // dst_ptr
1024 mov ecx, [esp + 32 + 16] // dst_width
1025 movdqa xmm2, _shuf01
1026 movdqa xmm3, _shuf11
1027 movdqa xmm4, _shuf21
1028 movdqa xmm5, _madd01
1029 movdqa xmm6, _madd11
1030 movdqa xmm7, _round34
1031
1032 wloop:
1033 movdqa xmm0, [esi] // pixels 0..7
1034 movdqa xmm1, [esi+ebx]
1035 pavgb xmm1, xmm0
1036 pavgb xmm0, xmm1
1037 pshufb xmm0, xmm2
1038 pmaddubsw xmm0, xmm5
1039 paddsw xmm0, xmm7
1040 psrlw xmm0, 2
1041 packuswb xmm0, xmm0
1042 movq qword ptr [edi], xmm0
1043 movdqu xmm0, [esi+8] // pixels 8..15
1044 movdqu xmm1, [esi+ebx+8]
1045 pavgb xmm1, xmm0
1046 pavgb xmm0, xmm1
1047 pshufb xmm0, xmm3
1048 pmaddubsw xmm0, xmm6
1049 paddsw xmm0, xmm7
1050 psrlw xmm0, 2
1051 packuswb xmm0, xmm0
1052 movq qword ptr [edi+8], xmm0
1053 movdqa xmm0, [esi+16] // pixels 16..23
1054 movdqa xmm1, [esi+ebx+16]
1055 lea esi, [esi+32]
1056 pavgb xmm1, xmm0
1057 pavgb xmm0, xmm1
1058 pshufb xmm0, xmm4
1059 movdqa xmm1, _madd21
1060 pmaddubsw xmm0, xmm1
1061 paddsw xmm0, xmm7
1062 psrlw xmm0, 2
1063 packuswb xmm0, xmm0
1064 movq qword ptr [edi+16], xmm0
1065 lea edi, [edi+24]
1066 sub ecx, 24
1067 ja wloop
1068
1069 popad
1070 ret
1071 }
1072 }
1073
1074 #define HAS_SCALEROWDOWN38_SSSE3
1075 // 3/8 point sampler
1076
1077 // Scale 32 pixels to 12
1078 __declspec(naked)
1079 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
1080 uint8* dst_ptr, int dst_width) {
1081 __asm {
1082 pushad
1083 mov esi, [esp + 32 + 4] // src_ptr
1084 mov edx, [esp + 32 + 8] // src_stride
1085 mov edi, [esp + 32 + 12] // dst_ptr
1086 mov ecx, [esp + 32 + 16] // dst_width
1087 movdqa xmm4, _shuf38a
1088 movdqa xmm5, _shuf38b
1089
1090 xloop:
1091 movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5
1092 movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11
1093 lea esi, [esi + 32]
1094 pshufb xmm0, xmm4
1095 pshufb xmm1, xmm5
1096 paddusb xmm0, xmm1
1097
1098 movq qword ptr [edi], xmm0 // write 12 pixels
1099 movhlps xmm1, xmm0
1100 movd [edi + 8], xmm1
1101 lea edi, [edi + 12]
1102 sub ecx, 12
1103 ja xloop
1104
1105 popad
1106 ret
1107 }
1108 }
1109
1110 // Scale 16x3 pixels to 6x1 with interpolation
1111 __declspec(naked)
1112 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
1113 uint8* dst_ptr, int dst_width) {
1114 __asm {
1115 pushad
1116 mov esi, [esp + 32 + 4] // src_ptr
1117 mov edx, [esp + 32 + 8] // src_stride
1118 mov edi, [esp + 32 + 12] // dst_ptr
1119 mov ecx, [esp + 32 + 16] // dst_width
1120 movdqa xmm4, _shufac0
1121 movdqa xmm5, _shufac3
1122 movdqa xmm6, _scaleac3
1123 pxor xmm7, xmm7
1124
1125 xloop:
1126 movdqa xmm0, [esi] // sum up 3 rows into xmm0/1
1127 movdqa xmm2, [esi + edx]
1128 movhlps xmm1, xmm0
1129 movhlps xmm3, xmm2
1130 punpcklbw xmm0, xmm7
1131 punpcklbw xmm1, xmm7
1132 punpcklbw xmm2, xmm7
1133 punpcklbw xmm3, xmm7
1134 paddusw xmm0, xmm2
1135 paddusw xmm1, xmm3
1136 movdqa xmm2, [esi + edx * 2]
1137 lea esi, [esi + 16]
1138 movhlps xmm3, xmm2
1139 punpcklbw xmm2, xmm7
1140 punpcklbw xmm3, xmm7
1141 paddusw xmm0, xmm2
1142 paddusw xmm1, xmm3
1143
1144 movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2
1145 psrldq xmm0, 2
1146 paddusw xmm2, xmm0
1147 psrldq xmm0, 2
1148 paddusw xmm2, xmm0
1149 pshufb xmm2, xmm4
1150
1151 movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2
1152 psrldq xmm1, 2
1153 paddusw xmm3, xmm1
1154 psrldq xmm1, 2
1155 paddusw xmm3, xmm1
1156 pshufb xmm3, xmm5
1157 paddusw xmm2, xmm3
1158
1159 pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6
1160 packuswb xmm2, xmm2
1161
1162 movd [edi], xmm2 // write 6 pixels
1163 pextrw eax, xmm2, 2
1164 mov [edi + 4], ax
1165 lea edi, [edi + 6]
1166 sub ecx, 6
1167 ja xloop
1168
1169 popad
1170 ret
1171 }
1172 }
1173
1174 // Scale 16x2 pixels to 6x1 with interpolation
1175 __declspec(naked)
1176 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
1177 uint8* dst_ptr, int dst_width) {
1178 __asm {
1179 pushad
1180 mov esi, [esp + 32 + 4] // src_ptr
1181 mov edx, [esp + 32 + 8] // src_stride
1182 mov edi, [esp + 32 + 12] // dst_ptr
1183 mov ecx, [esp + 32 + 16] // dst_width
1184 movdqa xmm4, _shufab0
1185 movdqa xmm5, _shufab1
1186 movdqa xmm6, _shufab2
1187 movdqa xmm7, _scaleab2
1188
1189 xloop:
1190 movdqa xmm2, [esi] // average 2 rows into xmm2
1191 pavgb xmm2, [esi + edx]
1192 lea esi, [esi + 16]
1193
1194 movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0
1195 pshufb xmm0, xmm4
1196 movdqa xmm1, xmm2
1197 pshufb xmm1, xmm5
1198 paddusw xmm0, xmm1
1199 pshufb xmm2, xmm6
1200 paddusw xmm0, xmm2
1201
1202 pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2
1203 packuswb xmm0, xmm0
1204
1205 movd [edi], xmm0 // write 6 pixels
1206 pextrw eax, xmm0, 2
1207 mov [edi + 4], ax
1208 lea edi, [edi + 6]
1209 sub ecx, 6
1210 ja xloop
1211
1212 popad
1213 ret
1214 }
1215 }
1216
1217 #define HAS_SCALEADDROWS_SSE2
1218
1219 // Reads 16xN bytes and produces 16 shorts at a time.
1220 __declspec(naked)
1221 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
1222 uint16* dst_ptr, int src_width,
1223 int src_height) {
1224 __asm {
1225 pushad
1226 mov esi, [esp + 32 + 4] // src_ptr
1227 mov edx, [esp + 32 + 8] // src_stride
1228 mov edi, [esp + 32 + 12] // dst_ptr
1229 mov ecx, [esp + 32 + 16] // dst_width
1230 mov ebx, [esp + 32 + 20] // height
1231 pxor xmm5, xmm5
1232 dec ebx
1233
1234 xloop:
1235 // first row
1236 movdqa xmm2, [esi]
1237 lea eax, [esi + edx]
1238 movhlps xmm3, xmm2
1239 mov ebp, ebx
1240 punpcklbw xmm2, xmm5
1241 punpcklbw xmm3, xmm5
1242
1243 // sum remaining rows
1244 yloop:
1245 movdqa xmm0, [eax] // read 16 pixels
1246 lea eax, [eax + edx] // advance to next row
1247 movhlps xmm1, xmm0
1248 punpcklbw xmm0, xmm5
1249 punpcklbw xmm1, xmm5
1250 paddusw xmm2, xmm0 // sum 16 words
1251 paddusw xmm3, xmm1
1252 sub ebp, 1
1253 ja yloop
1254
1255 movdqa [edi], xmm2
1256 movdqa [edi + 16], xmm3
1257 lea edi, [edi + 32]
1258 lea esi, [esi + 16]
1259
1260 sub ecx, 16
1261 ja xloop
1262
1263 popad
1264 ret
1265 }
1266 }
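
// Illustrative only: what ScaleAddRows_SSE2 computes, as plain C. Each output
// word is the column sum of src_height source rows, widened to 16 bits so the
// caller can divide once at the end. The SSE2 version uses saturating adds
// (paddusw); this sketch assumes the sums stay within 16 bits. Hypothetical name.
static void ScaleAddRows_C_Sketch(const uint8* src_ptr, int src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    uint16 sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum += src_ptr[x + y * src_stride];
    }
    dst_ptr[x] = sum;
  }
}
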
1267
1268 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
1269 #define HAS_SCALEFILTERROWS_SSE2
1270 __declspec(naked)
1271 static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
1272 int src_stride, int dst_width,
1273 int source_y_fraction) {
1274 __asm {
1275 push esi
1276 push edi
1277 mov edi, [esp + 8 + 4] // dst_ptr
1278 mov esi, [esp + 8 + 8] // src_ptr
1279 mov edx, [esp + 8 + 12] // src_stride
1280 mov ecx, [esp + 8 + 16] // dst_width
1281 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
1282 cmp eax, 0
1283 je xloop1
1284 cmp eax, 128
1285 je xloop2
1286
1287 movd xmm6, eax // xmm6 = y fraction
1288 punpcklwd xmm6, xmm6
1289 pshufd xmm6, xmm6, 0
1290 neg eax // xmm5 = 256 - y fraction
1291 add eax, 256
1292 movd xmm5, eax
1293 punpcklwd xmm5, xmm5
1294 pshufd xmm5, xmm5, 0
1295 pxor xmm7, xmm7
1296
1297 xloop:
1298 movdqa xmm0, [esi]
1299 movdqa xmm2, [esi + edx]
1300 lea esi, [esi + 16]
1301 movdqa xmm1, xmm0
1302 movdqa xmm3, xmm2
1303 punpcklbw xmm0, xmm7
1304 punpcklbw xmm2, xmm7
1305 punpckhbw xmm1, xmm7
1306 punpckhbw xmm3, xmm7
1307 pmullw xmm0, xmm5 // scale row 0
1308 pmullw xmm1, xmm5
1309 pmullw xmm2, xmm6 // scale row 1
1310 pmullw xmm3, xmm6
1311 paddusw xmm0, xmm2 // sum rows
1312 paddusw xmm1, xmm3
1313 psrlw xmm0, 8
1314 psrlw xmm1, 8
1315 packuswb xmm0, xmm1
1316 movdqa [edi], xmm0
1317 lea edi, [edi + 16]
1318 sub ecx, 16
1319 ja xloop
1320
1321 mov al, [edi - 1]
1322 mov [edi], al
1323 pop edi
1324 pop esi
1325 ret
1326
1327 xloop1:
1328 movdqa xmm0, [esi]
1329 lea esi, [esi + 16]
1330 movdqa [edi], xmm0
1331 lea edi, [edi + 16]
1332 sub ecx, 16
1333 ja xloop1
1334
1335 mov al, [edi - 1]
1336 mov [edi], al
1337 pop edi
1338 pop esi
1339 ret
1340
1341 xloop2:
1342 movdqa xmm0, [esi]
1343 movdqa xmm2, [esi + edx]
1344 lea esi, [esi + 16]
1345 pavgb xmm0, xmm2
1346 movdqa [edi], xmm0
1347 lea edi, [edi + 16]
1348 sub ecx, 16
1349 ja xloop2
1350
1351 mov al, [edi - 1]
1352 mov [edi], al
1353 pop edi
1354 pop esi
1355 ret
1356 }
1357 }
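
// Illustrative only: the bilinear row blend ScaleFilterRows_SSE2 performs, as
// plain C. source_y_fraction is 0..255; 0 copies row 0 and 128 is a plain
// average, which is why those cases get dedicated loops above. The routine
// also replicates the last output pixel one byte past the end, matching the
// "mov al, [edi - 1] / mov [edi], al" tail. Hypothetical name.
static void ScaleFilterRows_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     int src_stride, int dst_width,
                                     int source_y_fraction) {
  int x;
  int y1 = source_y_fraction;  // weight of row 1
  int y0 = 256 - y1;           // weight of row 0
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0 +
                          src_ptr[x + src_stride] * y1) >> 8);
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // replicate last pixel
}
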
1358
1359 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
1360 #define HAS_SCALEFILTERROWS_SSSE3
1361 __declspec(naked)
1362 static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
1363 int src_stride, int dst_width,
1364 int source_y_fraction) {
1365 __asm {
1366 push esi
1367 push edi
1368 mov edi, [esp + 8 + 4] // dst_ptr
1369 mov esi, [esp + 8 + 8] // src_ptr
1370 mov edx, [esp + 8 + 12] // src_stride
1371 mov ecx, [esp + 8 + 16] // dst_width
1372 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
1373 shr eax, 1
1374 cmp eax, 0
1375 je xloop1
1376 cmp eax, 64
1377 je xloop2
1378
1379 mov ah,al
1380 neg al
1381 add al, 128
1382 movd xmm5, eax
1383 punpcklwd xmm5, xmm5
1384 pshufd xmm5, xmm5, 0
1385
1386 xloop:
1387 movdqa xmm0, [esi]
1388 movdqa xmm2, [esi + edx]
1389 lea esi, [esi + 16]
1390 movdqa xmm1, xmm0
1391 punpcklbw xmm0, xmm2
1392 punpckhbw xmm1, xmm2
1393 pmaddubsw xmm0, xmm5
1394 pmaddubsw xmm1, xmm5
1395 psrlw xmm0, 7
1396 psrlw xmm1, 7
1397 packuswb xmm0, xmm1
1398 movdqa [edi], xmm0
1399 lea edi, [edi + 16]
1400 sub ecx, 16
1401 ja xloop
1402
1403 mov al, [edi - 1]
1404 mov [edi], al
1405 pop edi
1406 pop esi
1407 ret
1408
1409 xloop1:
1410 movdqa xmm0, [esi]
1411 lea esi, [esi + 16]
1412 movdqa [edi], xmm0
1413 lea edi, [edi + 16]
1414 sub ecx, 16
1415 ja xloop1
1416
1417 mov al, [edi - 1]
1418 mov [edi], al
1419 pop edi
1420 pop esi
1421 ret
1422
1423 xloop2:
1424 movdqa xmm0, [esi]
1425 movdqa xmm2, [esi + edx]
1426 lea esi, [esi + 16]
1427 pavgb xmm0, xmm2
1428 movdqa [edi], xmm0
1429 lea edi, [edi + 16]
1430 sub ecx, 16
1431 ja xloop2
1432
1433 mov al, [edi - 1]
1434 mov [edi], al
1435 pop edi
1436 pop esi
1437 ret
1438
1439 }
1440 }
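
// Illustrative only: the SSSE3 variant above halves the fraction to 7 bits so
// both weights fit in one byte pair for pmaddubsw. In scalar terms, assuming
// the same 0..255 input fraction:
//   f7 = source_y_fraction >> 1;                        // 0..127
//   dst[x] = (row0[x] * (128 - f7) + row1[x] * f7) >> 7;
// The two weights are packed into al/ah and broadcast with punpcklwd/pshufd.
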
1441
1442 // Note that movdqa+palign may be better than movdqu.
1443 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
1444 __declspec(naked)
1445 static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
1446 int dst_width) {
1447 __asm {
1448 mov edx, [esp + 4] // dst_ptr
1449 mov eax, [esp + 8] // src_ptr
1450 mov ecx, [esp + 12] // dst_width
1451 movdqa xmm1, _round34
1452 movdqa xmm2, _shuf01
1453 movdqa xmm3, _shuf11
1454 movdqa xmm4, _shuf21
1455 movdqa xmm5, _madd01
1456 movdqa xmm6, _madd11
1457 movdqa xmm7, _madd21
1458
1459 wloop:
1460 movdqa xmm0, [eax] // pixels 0..7
1461 pshufb xmm0, xmm2
1462 pmaddubsw xmm0, xmm5
1463 paddsw xmm0, xmm1
1464 psrlw xmm0, 2
1465 packuswb xmm0, xmm0
1466 movq qword ptr [edx], xmm0
1467 movdqu xmm0, [eax+8] // pixels 8..15
1468 pshufb xmm0, xmm3
1469 pmaddubsw xmm0, xmm6
1470 paddsw xmm0, xmm1
1471 psrlw xmm0, 2
1472 packuswb xmm0, xmm0
1473 movq qword ptr [edx+8], xmm0
1474 movdqa xmm0, [eax+16] // pixels 16..23
1475 lea eax, [eax+32]
1476 pshufb xmm0, xmm4
1477 pmaddubsw xmm0, xmm7
1478 paddsw xmm0, xmm1
1479 psrlw xmm0, 2
1480 packuswb xmm0, xmm0
1481 movq qword ptr [edx+16], xmm0
1482 lea edx, [edx+24]
1483 sub ecx, 24
1484 ja wloop
1485 ret
1486 }
1487 }
1488
1489 #elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
1490
1491 // GCC versions of row functions are verbatim conversions from Visual C.
1492 // Generated using gcc disassembly on Visual C object file:
1493 // objdump -D yuvscaler.obj >yuvscaler.txt
1494 #define HAS_SCALEROWDOWN2_SSE2
1495 static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
1496 uint8* dst_ptr, int dst_width) {
1497 asm volatile (
1498 "pcmpeqb %%xmm5,%%xmm5 \n"
1499 "psrlw $0x8,%%xmm5 \n"
1500 "1:"
1501 "movdqa (%0),%%xmm0 \n"
1502 "movdqa 0x10(%0),%%xmm1 \n"
1503 "lea 0x20(%0),%0 \n"
1504 "pand %%xmm5,%%xmm0 \n"
1505 "pand %%xmm5,%%xmm1 \n"
1506 "packuswb %%xmm1,%%xmm0 \n"
1507 "movdqa %%xmm0,(%1) \n"
1508 "lea 0x10(%1),%1 \n"
1509 "sub $0x10,%2 \n"
1510 "ja 1b \n"
1511 : "+r"(src_ptr), // %0
1512 "+r"(dst_ptr), // %1
1513 "+r"(dst_width) // %2
1514 :
1515 : "memory", "cc"
1516 );
1517 }
1518
1519 static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
1520 uint8* dst_ptr, int dst_width) {
1521 asm volatile (
1522 "pcmpeqb %%xmm5,%%xmm5 \n"
1523 "psrlw $0x8,%%xmm5 \n"
1524 "1:"
1525 "movdqa (%0),%%xmm0 \n"
1526 "movdqa 0x10(%0),%%xmm1 \n"
1527 "movdqa (%0,%3,1),%%xmm2 \n"
1528 "movdqa 0x10(%0,%3,1),%%xmm3 \n"
1529 "lea 0x20(%0),%0 \n"
1530 "pavgb %%xmm2,%%xmm0 \n"
1531 "pavgb %%xmm3,%%xmm1 \n"
1532 "movdqa %%xmm0,%%xmm2 \n"
1533 "psrlw $0x8,%%xmm0 \n"
1534 "movdqa %%xmm1,%%xmm3 \n"
1535 "psrlw $0x8,%%xmm1 \n"
1536 "pand %%xmm5,%%xmm2 \n"
1537 "pand %%xmm5,%%xmm3 \n"
1538 "pavgw %%xmm2,%%xmm0 \n"
1539 "pavgw %%xmm3,%%xmm1 \n"
1540 "packuswb %%xmm1,%%xmm0 \n"
1541 "movdqa %%xmm0,(%1) \n"
1542 "lea 0x10(%1),%1 \n"
1543 "sub $0x10,%2 \n"
1544 "ja 1b \n"
1545 : "+r"(src_ptr), // %0
1546 "+r"(dst_ptr), // %1
1547 "+r"(dst_width) // %2
1548 : "r"((intptr_t)(src_stride)) // %3
1549 : "memory", "cc"
1550 );
1551 }
1552
1553 #define HAS_SCALEROWDOWN4_SSE2
1554 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
1555 uint8* dst_ptr, int dst_width) {
1556 asm volatile (
1557 "pcmpeqb %%xmm5,%%xmm5 \n"
1558 "psrld $0x18,%%xmm5 \n"
1559 "1:"
1560 "movdqa (%0),%%xmm0 \n"
1561 "movdqa 0x10(%0),%%xmm1 \n"
1562 "lea 0x20(%0),%0 \n"
1563 "pand %%xmm5,%%xmm0 \n"
1564 "pand %%xmm5,%%xmm1 \n"
1565 "packuswb %%xmm1,%%xmm0 \n"
1566 "packuswb %%xmm0,%%xmm0 \n"
1567 "movq %%xmm0,(%1) \n"
1568 "lea 0x8(%1),%1 \n"
1569 "sub $0x8,%2 \n"
1570 "ja 1b \n"
1571 : "+r"(src_ptr), // %0
1572 "+r"(dst_ptr), // %1
1573 "+r"(dst_width) // %2
1574 :
1575 : "memory", "cc"
1576 );
1577 }
1578
1579 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
1580 uint8* dst_ptr, int dst_width) {
1581 intptr_t temp = 0;
1582 asm volatile (
1583 "pcmpeqb %%xmm7,%%xmm7 \n"
1584 "psrlw $0x8,%%xmm7 \n"
1585 "lea (%4,%4,2),%3 \n"
1586 "1:"
1587 "movdqa (%0),%%xmm0 \n"
1588 "movdqa 0x10(%0),%%xmm1 \n"
1589 "movdqa (%0,%4,1),%%xmm2 \n"
1590 "movdqa 0x10(%0,%4,1),%%xmm3 \n"
1591 "pavgb %%xmm2,%%xmm0 \n"
1592 "pavgb %%xmm3,%%xmm1 \n"
1593 "movdqa (%0,%4,2),%%xmm2 \n"
1594 "movdqa 0x10(%0,%4,2),%%xmm3 \n"
1595 "movdqa (%0,%3,1),%%xmm4 \n"
1596 "movdqa 0x10(%0,%3,1),%%xmm5 \n"
1597 "lea 0x20(%0),%0 \n"
1598 "pavgb %%xmm4,%%xmm2 \n"
1599 "pavgb %%xmm2,%%xmm0 \n"
1600 "pavgb %%xmm5,%%xmm3 \n"
1601 "pavgb %%xmm3,%%xmm1 \n"
1602 "movdqa %%xmm0,%%xmm2 \n"
1603 "psrlw $0x8,%%xmm0 \n"
1604 "movdqa %%xmm1,%%xmm3 \n"
1605 "psrlw $0x8,%%xmm1 \n"
1606 "pand %%xmm7,%%xmm2 \n"
1607 "pand %%xmm7,%%xmm3 \n"
1608 "pavgw %%xmm2,%%xmm0 \n"
1609 "pavgw %%xmm3,%%xmm1 \n"
1610 "packuswb %%xmm1,%%xmm0 \n"
1611 "movdqa %%xmm0,%%xmm2 \n"
1612 "psrlw $0x8,%%xmm0 \n"
1613 "pand %%xmm7,%%xmm2 \n"
1614 "pavgw %%xmm2,%%xmm0 \n"
1615 "packuswb %%xmm0,%%xmm0 \n"
1616 "movq %%xmm0,(%1) \n"
1617 "lea 0x8(%1),%1 \n"
1618 "sub $0x8,%2 \n"
1619 "ja 1b \n"
1620 : "+r"(src_ptr), // %0
1621 "+r"(dst_ptr), // %1
1622 "+r"(dst_width), // %2
1623 "+r"(temp) // %3
1624 : "r"((intptr_t)(src_stride)) // %4
1625 : "memory", "cc"
1626 #if defined(__x86_64__)
1627 , "xmm6", "xmm7"
1628 #endif
1629 );
1630 }
1631
1632 #define HAS_SCALEROWDOWN8_SSE2
1633 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
1634 uint8* dst_ptr, int dst_width) {
1635 asm volatile (
1636 "pcmpeqb %%xmm5,%%xmm5 \n"
1637 "psrlq $0x38,%%xmm5 \n"
1638 "1:"
1639 "movdqa (%0),%%xmm0 \n"
1640 "movdqa 0x10(%0),%%xmm1 \n"
1641 "lea 0x20(%0),%0 \n"
1642 "pand %%xmm5,%%xmm0 \n"
1643 "pand %%xmm5,%%xmm1 \n"
1644 "packuswb %%xmm1,%%xmm0 \n"
1645 "packuswb %%xmm0,%%xmm0 \n"
1646 "packuswb %%xmm0,%%xmm0 \n"
1647 "movd %%xmm0,(%1) \n"
1648 "lea 0x4(%1),%1 \n"
1649 "sub $0x4,%2 \n"
1650 "ja 1b \n"
1651 : "+r"(src_ptr), // %0
1652 "+r"(dst_ptr), // %1
1653 "+r"(dst_width) // %2
1654 :
1655 : "memory", "cc"
1656 );
1657 }
1658
1659 #if defined(__i386__)
1660 void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
1661 uint8* dst_ptr, int dst_width);
1662 asm(
1663 DECLARE_FUNCTION(ScaleRowDown8Int_SSE2)
1664 "pusha \n"
1665 "mov 0x24(%esp),%esi \n"
1666 "mov 0x28(%esp),%ebx \n"
1667 "mov 0x2c(%esp),%edi \n"
1668 "mov 0x30(%esp),%ecx \n"
1669 "lea (%ebx,%ebx,2),%edx \n"
1670 "pxor %xmm7,%xmm7 \n"
1671
1672 "1:"
1673 "movdqa (%esi),%xmm0 \n"
1674 "movdqa 0x10(%esi),%xmm1 \n"
1675 "movdqa (%esi,%ebx,1),%xmm2 \n"
1676 "movdqa 0x10(%esi,%ebx,1),%xmm3 \n"
1677 "pavgb %xmm2,%xmm0 \n"
1678 "pavgb %xmm3,%xmm1 \n"
1679 "movdqa (%esi,%ebx,2),%xmm2 \n"
1680 "movdqa 0x10(%esi,%ebx,2),%xmm3 \n"
1681 "movdqa (%esi,%edx,1),%xmm4 \n"
1682 "movdqa 0x10(%esi,%edx,1),%xmm5 \n"
1683 "lea (%esi,%ebx,4),%ebp \n"
1684 "lea 0x20(%esi),%esi \n"
1685 "pavgb %xmm4,%xmm2 \n"
1686 "pavgb %xmm5,%xmm3 \n"
1687 "pavgb %xmm2,%xmm0 \n"
1688 "pavgb %xmm3,%xmm1 \n"
1689 "movdqa 0x0(%ebp),%xmm2 \n"
1690 "movdqa 0x10(%ebp),%xmm3 \n"
1691 "movdqa 0x0(%ebp,%ebx,1),%xmm4 \n"
1692 "movdqa 0x10(%ebp,%ebx,1),%xmm5 \n"
1693 "pavgb %xmm4,%xmm2 \n"
1694 "pavgb %xmm5,%xmm3 \n"
1695 "movdqa 0x0(%ebp,%ebx,2),%xmm4 \n"
1696 "movdqa 0x10(%ebp,%ebx,2),%xmm5 \n"
1697 "movdqa 0x0(%ebp,%edx,1),%xmm6 \n"
1698 "pavgb %xmm6,%xmm4 \n"
1699 "movdqa 0x10(%ebp,%edx,1),%xmm6 \n"
1700 "pavgb %xmm6,%xmm5 \n"
1701 "pavgb %xmm4,%xmm2 \n"
1702 "pavgb %xmm5,%xmm3 \n"
1703 "pavgb %xmm2,%xmm0 \n"
1704 "pavgb %xmm3,%xmm1 \n"
1705 "psadbw %xmm7,%xmm0 \n"
1706 "psadbw %xmm7,%xmm1 \n"
1707 "pshufd $0xd8,%xmm0,%xmm0 \n"
1708 "pshufd $0x8d,%xmm1,%xmm1 \n"
1709 "por %xmm1,%xmm0 \n"
1710 "psrlw $0x3,%xmm0 \n"
1711 "packuswb %xmm0,%xmm0 \n"
1712 "packuswb %xmm0,%xmm0 \n"
1713 "movd %xmm0,(%edi) \n"
1714 "lea 0x4(%edi),%edi \n"
1715 "sub $0x4,%ecx \n"
1716 "ja 1b \n"
1717 "popa \n"
1718 "ret \n"
1719 );
1720
1721 // -fPIC is used for the magiccam plugin
1722 #if !defined(__PIC__)
1723 #define HAS_SCALEROWDOWN34_SSSE3
1724 void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
1725 uint8* dst_ptr, int dst_width);
1726 asm(
1727 DECLARE_FUNCTION(ScaleRowDown34_SSSE3)
1728 "pusha \n"
1729 "mov 0x24(%esp),%esi \n"
1730 "mov 0x2c(%esp),%edi \n"
1731 "mov 0x30(%esp),%ecx \n"
1732 "movdqa _shuf0,%xmm3 \n"
1733 "movdqa _shuf1,%xmm4 \n"
1734 "movdqa _shuf2,%xmm5 \n"
1735
1736 "1:"
1737 "movdqa (%esi),%xmm0 \n"
1738 "movdqa 0x10(%esi),%xmm2 \n"
1739 "lea 0x20(%esi),%esi \n"
1740 "movdqa %xmm2,%xmm1 \n"
1741 "palignr $0x8,%xmm0,%xmm1 \n"
1742 "pshufb %xmm3,%xmm0 \n"
1743 "pshufb %xmm4,%xmm1 \n"
1744 "pshufb %xmm5,%xmm2 \n"
1745 "movq %xmm0,(%edi) \n"
1746 "movq %xmm1,0x8(%edi) \n"
1747 "movq %xmm2,0x10(%edi) \n"
1748 "lea 0x18(%edi),%edi \n"
1749 "sub $0x18,%ecx \n"
1750 "ja 1b \n"
1751 "popa \n"
1752 "ret \n"
1753 );
1754
1755 void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
1756 uint8* dst_ptr, int dst_width);
1757 asm(
1758 DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3)
1759 "pusha \n"
1760 "mov 0x24(%esp),%esi \n"
1761 "mov 0x28(%esp),%ebp \n"
1762 "mov 0x2c(%esp),%edi \n"
1763 "mov 0x30(%esp),%ecx \n"
1764 "movdqa _shuf01,%xmm2 \n"
1765 "movdqa _shuf11,%xmm3 \n"
1766 "movdqa _shuf21,%xmm4 \n"
1767 "movdqa _madd01,%xmm5 \n"
1768 "movdqa _madd11,%xmm6 \n"
1769 "movdqa _round34,%xmm7 \n"
1770
1771 "1:"
1772 "movdqa (%esi),%xmm0 \n"
1773 "movdqa (%esi,%ebp),%xmm1 \n"
1774 "pavgb %xmm1,%xmm0 \n"
1775 "pshufb %xmm2,%xmm0 \n"
1776 "pmaddubsw %xmm5,%xmm0 \n"
1777 "paddsw %xmm7,%xmm0 \n"
1778 "psrlw $0x2,%xmm0 \n"
1779 "packuswb %xmm0,%xmm0 \n"
1780 "movq %xmm0,(%edi) \n"
1781 "movdqu 0x8(%esi),%xmm0 \n"
1782 "movdqu 0x8(%esi,%ebp),%xmm1 \n"
1783 "pavgb %xmm1,%xmm0 \n"
1784 "pshufb %xmm3,%xmm0 \n"
1785 "pmaddubsw %xmm6,%xmm0 \n"
1786 "paddsw %xmm7,%xmm0 \n"
1787 "psrlw $0x2,%xmm0 \n"
1788 "packuswb %xmm0,%xmm0 \n"
1789 "movq %xmm0,0x8(%edi) \n"
1790 "movdqa 0x10(%esi),%xmm0 \n"
1791 "movdqa 0x10(%esi,%ebp),%xmm1 \n"
1792 "lea 0x20(%esi),%esi \n"
1793 "pavgb %xmm1,%xmm0 \n"
1794 "pshufb %xmm4,%xmm0 \n"
1795 "movdqa _madd21,%xmm1 \n"
1796 "pmaddubsw %xmm1,%xmm0 \n"
1797 "paddsw %xmm7,%xmm0 \n"
1798 "psrlw $0x2,%xmm0 \n"
1799 "packuswb %xmm0,%xmm0 \n"
1800 "movq %xmm0,0x10(%edi) \n"
1801 "lea 0x18(%edi),%edi \n"
1802 "sub $0x18,%ecx \n"
1803 "ja 1b \n"
1804
1805 "popa \n"
1806 "ret \n"
1807 );
1808
1809 void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
1810 uint8* dst_ptr, int dst_width);
1811 asm(
1812 DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3)
1813 "pusha \n"
1814 "mov 0x24(%esp),%esi \n"
1815 "mov 0x28(%esp),%ebp \n"
1816 "mov 0x2c(%esp),%edi \n"
1817 "mov 0x30(%esp),%ecx \n"
1818 "movdqa _shuf01,%xmm2 \n"
1819 "movdqa _shuf11,%xmm3 \n"
1820 "movdqa _shuf21,%xmm4 \n"
1821 "movdqa _madd01,%xmm5 \n"
1822 "movdqa _madd11,%xmm6 \n"
1823 "movdqa _round34,%xmm7 \n"
1824
1825 "1:"
1826 "movdqa (%esi),%xmm0 \n"
1827 "movdqa (%esi,%ebp,1),%xmm1 \n"
1828 "pavgb %xmm0,%xmm1 \n"
1829 "pavgb %xmm1,%xmm0 \n"
1830 "pshufb %xmm2,%xmm0 \n"
1831 "pmaddubsw %xmm5,%xmm0 \n"
1832 "paddsw %xmm7,%xmm0 \n"
1833 "psrlw $0x2,%xmm0 \n"
1834 "packuswb %xmm0,%xmm0 \n"
1835 "movq %xmm0,(%edi) \n"
1836 "movdqu 0x8(%esi),%xmm0 \n"
1837 "movdqu 0x8(%esi,%ebp,1),%xmm1 \n"
1838 "pavgb %xmm0,%xmm1 \n"
1839 "pavgb %xmm1,%xmm0 \n"
1840 "pshufb %xmm3,%xmm0 \n"
1841 "pmaddubsw %xmm6,%xmm0 \n"
1842 "paddsw %xmm7,%xmm0 \n"
1843 "psrlw $0x2,%xmm0 \n"
1844 "packuswb %xmm0,%xmm0 \n"
1845 "movq %xmm0,0x8(%edi) \n"
1846 "movdqa 0x10(%esi),%xmm0 \n"
1847 "movdqa 0x10(%esi,%ebp,1),%xmm1 \n"
1848 "lea 0x20(%esi),%esi \n"
1849 "pavgb %xmm0,%xmm1 \n"
1850 "pavgb %xmm1,%xmm0 \n"
1851 "pshufb %xmm4,%xmm0 \n"
1852 "movdqa _madd21,%xmm1 \n"
1853 "pmaddubsw %xmm1,%xmm0 \n"
1854 "paddsw %xmm7,%xmm0 \n"
1855 "psrlw $0x2,%xmm0 \n"
1856 "packuswb %xmm0,%xmm0 \n"
1857 "movq %xmm0,0x10(%edi) \n"
1858 "lea 0x18(%edi),%edi \n"
1859 "sub $0x18,%ecx \n"
1860 "ja 1b \n"
1861 "popa \n"
1862 "ret \n"
1863 );
1864
1865 #define HAS_SCALEROWDOWN38_SSSE3
1866 void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
1867 uint8* dst_ptr, int dst_width);
1868 asm(
1869 DECLARE_FUNCTION(ScaleRowDown38_SSSE3)
1870 "pusha \n"
1871 "mov 0x24(%esp),%esi \n"
1872 "mov 0x28(%esp),%edx \n"
1873 "mov 0x2c(%esp),%edi \n"
1874 "mov 0x30(%esp),%ecx \n"
1875 "movdqa _shuf38a ,%xmm4 \n"
1876 "movdqa _shuf38b ,%xmm5 \n"
1877
1878 "1:"
1879 "movdqa (%esi),%xmm0 \n"
1880 "movdqa 0x10(%esi),%xmm1 \n"
1881 "lea 0x20(%esi),%esi \n"
1882 "pshufb %xmm4,%xmm0 \n"
1883 "pshufb %xmm5,%xmm1 \n"
1884 "paddusb %xmm1,%xmm0 \n"
1885 "movq %xmm0,(%edi) \n"
1886 "movhlps %xmm0,%xmm1 \n"
1887 "movd %xmm1,0x8(%edi) \n"
1888 "lea 0xc(%edi),%edi \n"
1889 "sub $0xc,%ecx \n"
1890 "ja 1b \n"
1891 "popa \n"
1892 "ret \n"
1893 );
1894
1895 void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
1896 uint8* dst_ptr, int dst_width);
1897 asm(
1898 DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3)
1899 "pusha \n"
1900 "mov 0x24(%esp),%esi \n"
1901 "mov 0x28(%esp),%edx \n"
1902 "mov 0x2c(%esp),%edi \n"
1903 "mov 0x30(%esp),%ecx \n"
1904 "movdqa _shufac0,%xmm4 \n"
1905 "movdqa _shufac3,%xmm5 \n"
1906 "movdqa _scaleac3,%xmm6 \n"
1907 "pxor %xmm7,%xmm7 \n"
1908
1909 "1:"
1910 "movdqa (%esi),%xmm0 \n"
1911 "movdqa (%esi,%edx,1),%xmm2 \n"
1912 "movhlps %xmm0,%xmm1 \n"
1913 "movhlps %xmm2,%xmm3 \n"
1914 "punpcklbw %xmm7,%xmm0 \n"
1915 "punpcklbw %xmm7,%xmm1 \n"
1916 "punpcklbw %xmm7,%xmm2 \n"
1917 "punpcklbw %xmm7,%xmm3 \n"
1918 "paddusw %xmm2,%xmm0 \n"
1919 "paddusw %xmm3,%xmm1 \n"
1920 "movdqa (%esi,%edx,2),%xmm2 \n"
1921 "lea 0x10(%esi),%esi \n"
1922 "movhlps %xmm2,%xmm3 \n"
1923 "punpcklbw %xmm7,%xmm2 \n"
1924 "punpcklbw %xmm7,%xmm3 \n"
1925 "paddusw %xmm2,%xmm0 \n"
1926 "paddusw %xmm3,%xmm1 \n"
1927 "movdqa %xmm0,%xmm2 \n"
1928 "psrldq $0x2,%xmm0 \n"
1929 "paddusw %xmm0,%xmm2 \n"
1930 "psrldq $0x2,%xmm0 \n"
1931 "paddusw %xmm0,%xmm2 \n"
1932 "pshufb %xmm4,%xmm2 \n"
1933 "movdqa %xmm1,%xmm3 \n"
1934 "psrldq $0x2,%xmm1 \n"
1935 "paddusw %xmm1,%xmm3 \n"
1936 "psrldq $0x2,%xmm1 \n"
1937 "paddusw %xmm1,%xmm3 \n"
1938 "pshufb %xmm5,%xmm3 \n"
1939 "paddusw %xmm3,%xmm2 \n"
1940 "pmulhuw %xmm6,%xmm2 \n"
1941 "packuswb %xmm2,%xmm2 \n"
1942 "movd %xmm2,(%edi) \n"
1943 "pextrw $0x2,%xmm2,%eax \n"
1944 "mov %ax,0x4(%edi) \n"
1945 "lea 0x6(%edi),%edi \n"
1946 "sub $0x6,%ecx \n"
1947 "ja 1b \n"
1948 "popa \n"
1949 "ret \n"
1950 );
1951
1952 void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
1953 uint8* dst_ptr, int dst_width);
1954 asm(
1955 DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3)
1956 "pusha \n"
1957 "mov 0x24(%esp),%esi \n"
1958 "mov 0x28(%esp),%edx \n"
1959 "mov 0x2c(%esp),%edi \n"
1960 "mov 0x30(%esp),%ecx \n"
1961 "movdqa _shufab0,%xmm4 \n"
1962 "movdqa _shufab1,%xmm5 \n"
1963 "movdqa _shufab2,%xmm6 \n"
1964 "movdqa _scaleab2,%xmm7 \n"
1965
1966 "1:"
1967 "movdqa (%esi),%xmm2 \n"
1968 "pavgb (%esi,%edx,1),%xmm2 \n"
1969 "lea 0x10(%esi),%esi \n"
1970 "movdqa %xmm2,%xmm0 \n"
1971 "pshufb %xmm4,%xmm0 \n"
1972 "movdqa %xmm2,%xmm1 \n"
1973 "pshufb %xmm5,%xmm1 \n"
1974 "paddusw %xmm1,%xmm0 \n"
1975 "pshufb %xmm6,%xmm2 \n"
1976 "paddusw %xmm2,%xmm0 \n"
1977 "pmulhuw %xmm7,%xmm0 \n"
1978 "packuswb %xmm0,%xmm0 \n"
1979 "movd %xmm0,(%edi) \n"
1980 "pextrw $0x2,%xmm0,%eax \n"
1981 "mov %ax,0x4(%edi) \n"
1982 "lea 0x6(%edi),%edi \n"
1983 "sub $0x6,%ecx \n"
1984 "ja 1b \n"
1985 "popa \n"
1986 "ret \n"
1987 );
1988 #endif // __PIC__
1989
1990 #define HAS_SCALEADDROWS_SSE2
1991 void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
1992 uint16* dst_ptr, int src_width,
1993 int src_height);
1994 asm(
1995 DECLARE_FUNCTION(ScaleAddRows_SSE2)
1996 "pusha \n"
1997 "mov 0x24(%esp),%esi \n"
1998 "mov 0x28(%esp),%edx \n"
1999 "mov 0x2c(%esp),%edi \n"
2000 "mov 0x30(%esp),%ecx \n"
2001 "mov 0x34(%esp),%ebx \n"
2002 "pxor %xmm5,%xmm5 \n"
2003
2004 "1:"
2005 "movdqa (%esi),%xmm2 \n"
2006 "lea (%esi,%edx,1),%eax \n"
2007 "movhlps %xmm2,%xmm3 \n"
2008 "lea -0x1(%ebx),%ebp \n"
2009 "punpcklbw %xmm5,%xmm2 \n"
2010 "punpcklbw %xmm5,%xmm3 \n"
2011
2012 "2:"
2013 "movdqa (%eax),%xmm0 \n"
2014 "lea (%eax,%edx,1),%eax \n"
2015 "movhlps %xmm0,%xmm1 \n"
2016 "punpcklbw %xmm5,%xmm0 \n"
2017 "punpcklbw %xmm5,%xmm1 \n"
2018 "paddusw %xmm0,%xmm2 \n"
2019 "paddusw %xmm1,%xmm3 \n"
2020 "sub $0x1,%ebp \n"
2021 "ja 2b \n"
2022
2023 "movdqa %xmm2,(%edi) \n"
2024 "movdqa %xmm3,0x10(%edi) \n"
2025 "lea 0x20(%edi),%edi \n"
2026 "lea 0x10(%esi),%esi \n"
2027 "sub $0x10,%ecx \n"
2028 "ja 1b \n"
2029 "popa \n"
2030 "ret \n"
2031 );
2032
2033 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
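// source_y_fraction is an 8 bit vertical blend factor: 0 copies row 0, 128 averages
// the two rows with pavgb, and other values compute
// (row0 * (256 - fraction) + row1 * fraction) >> 8 with 16 bit multiplies.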
2034 #define HAS_SCALEFILTERROWS_SSE2
2035 void ScaleFilterRows_SSE2(uint8* dst_ptr,
2036 const uint8* src_ptr, int src_stride,
2037 int dst_width, int source_y_fraction);
2038 asm(
2039 DECLARE_FUNCTION(ScaleFilterRows_SSE2)
2040 "push %esi \n"
2041 "push %edi \n"
2042 "mov 0xc(%esp),%edi \n"
2043 "mov 0x10(%esp),%esi \n"
2044 "mov 0x14(%esp),%edx \n"
2045 "mov 0x18(%esp),%ecx \n"
2046 "mov 0x1c(%esp),%eax \n"
2047 "cmp $0x0,%eax \n"
2048 "je 2f \n"
2049 "cmp $0x80,%eax \n"
2050 "je 3f \n"
2051 "movd %eax,%xmm6 \n"
2052 "punpcklwd %xmm6,%xmm6 \n"
2053 "pshufd $0x0,%xmm6,%xmm6 \n"
2054 "neg %eax \n"
2055 "add $0x100,%eax \n"
2056 "movd %eax,%xmm5 \n"
2057 "punpcklwd %xmm5,%xmm5 \n"
2058 "pshufd $0x0,%xmm5,%xmm5 \n"
2059 "pxor %xmm7,%xmm7 \n"
2060
2061 "1:"
2062 "movdqa (%esi),%xmm0 \n"
2063 "movdqa (%esi,%edx,1),%xmm2 \n"
2064 "lea 0x10(%esi),%esi \n"
2065 "movdqa %xmm0,%xmm1 \n"
2066 "movdqa %xmm2,%xmm3 \n"
2067 "punpcklbw %xmm7,%xmm0 \n"
2068 "punpcklbw %xmm7,%xmm2 \n"
2069 "punpckhbw %xmm7,%xmm1 \n"
2070 "punpckhbw %xmm7,%xmm3 \n"
2071 "pmullw %xmm5,%xmm0 \n"
2072 "pmullw %xmm5,%xmm1 \n"
2073 "pmullw %xmm6,%xmm2 \n"
2074 "pmullw %xmm6,%xmm3 \n"
2075 "paddusw %xmm2,%xmm0 \n"
2076 "paddusw %xmm3,%xmm1 \n"
2077 "psrlw $0x8,%xmm0 \n"
2078 "psrlw $0x8,%xmm1 \n"
2079 "packuswb %xmm1,%xmm0 \n"
2080 "movdqa %xmm0,(%edi) \n"
2081 "lea 0x10(%edi),%edi \n"
2082 "sub $0x10,%ecx \n"
2083 "ja 1b \n"
2084 "mov -0x1(%edi),%al \n"
2085 "mov %al,(%edi) \n"
2086 "pop %edi \n"
2087 "pop %esi \n"
2088 "ret \n"
2089
2090 "2:"
2091 "movdqa (%esi),%xmm0 \n"
2092 "lea 0x10(%esi),%esi \n"
2093 "movdqa %xmm0,(%edi) \n"
2094 "lea 0x10(%edi),%edi \n"
2095 "sub $0x10,%ecx \n"
2096 "ja 2b \n"
2097
2098 "mov -0x1(%edi),%al \n"
2099 "mov %al,(%edi) \n"
2100 "pop %edi \n"
2101 "pop %esi \n"
2102 "ret \n"
2103
2104 "3:"
2105 "movdqa (%esi),%xmm0 \n"
2106 "movdqa (%esi,%edx,1),%xmm2 \n"
2107 "lea 0x10(%esi),%esi \n"
2108 "pavgb %xmm2,%xmm0 \n"
2109 "movdqa %xmm0,(%edi) \n"
2110 "lea 0x10(%edi),%edi \n"
2111 "sub $0x10,%ecx \n"
2112 "ja 3b \n"
2113
2114 "mov -0x1(%edi),%al \n"
2115 "mov %al,(%edi) \n"
2116 "pop %edi \n"
2117 "pop %esi \n"
2118 "ret \n"
2119 );
2120
2121 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
2122 #define HAS_SCALEFILTERROWS_SSSE3
2123 void ScaleFilterRows_SSSE3(uint8* dst_ptr,
2124 const uint8* src_ptr, int src_stride,
2125 int dst_width, int source_y_fraction);
2126 asm(
2127 DECLARE_FUNCTION(ScaleFilterRows_SSSE3)
2128 "push %esi \n"
2129 "push %edi \n"
2130 "mov 0xc(%esp),%edi \n"
2131 "mov 0x10(%esp),%esi \n"
2132 "mov 0x14(%esp),%edx \n"
2133 "mov 0x18(%esp),%ecx \n"
2134 "mov 0x1c(%esp),%eax \n"
2135 "shr %eax \n"
2136 "cmp $0x0,%eax \n"
2137 "je 2f \n"
2138 "cmp $0x40,%eax \n"
2139 "je 3f \n"
2140 "mov %al,%ah \n"
2141 "neg %al \n"
2142 "add $0x80,%al \n"
2143 "movd %eax,%xmm5 \n"
2144 "punpcklwd %xmm5,%xmm5 \n"
2145 "pshufd $0x0,%xmm5,%xmm5 \n"
2146
2147 "1:"
2148 "movdqa (%esi),%xmm0 \n"
2149 "movdqa (%esi,%edx,1),%xmm2 \n"
2150 "lea 0x10(%esi),%esi \n"
2151 "movdqa %xmm0,%xmm1 \n"
2152 "punpcklbw %xmm2,%xmm0 \n"
2153 "punpckhbw %xmm2,%xmm1 \n"
2154 "pmaddubsw %xmm5,%xmm0 \n"
2155 "pmaddubsw %xmm5,%xmm1 \n"
2156 "psrlw $0x7,%xmm0 \n"
2157 "psrlw $0x7,%xmm1 \n"
2158 "packuswb %xmm1,%xmm0 \n"
2159 "movdqa %xmm0,(%edi) \n"
2160 "lea 0x10(%edi),%edi \n"
2161 "sub $0x10,%ecx \n"
2162 "ja 1b \n"
2163 "mov -0x1(%edi),%al \n"
2164 "mov %al,(%edi) \n"
2165 "pop %edi \n"
2166 "pop %esi \n"
2167 "ret \n"
2168
2169 "2:"
2170 "movdqa (%esi),%xmm0 \n"
2171 "lea 0x10(%esi),%esi \n"
2172 "movdqa %xmm0,(%edi) \n"
2173 "lea 0x10(%edi),%edi \n"
2174 "sub $0x10,%ecx \n"
2175 "ja 2b \n"
2176 "mov -0x1(%edi),%al \n"
2177 "mov %al,(%edi) \n"
2178 "pop %edi \n"
2179 "pop %esi \n"
2180 "ret \n"
2181
2182 "3:"
2183 "movdqa (%esi),%xmm0 \n"
2184 "movdqa (%esi,%edx,1),%xmm2 \n"
2185 "lea 0x10(%esi),%esi \n"
2186 "pavgb %xmm2,%xmm0 \n"
2187 "movdqa %xmm0,(%edi) \n"
2188 "lea 0x10(%edi),%edi \n"
2189 "sub $0x10,%ecx \n"
2190 "ja 3b \n"
2191 "mov -0x1(%edi),%al \n"
2192 "mov %al,(%edi) \n"
2193 "pop %edi \n"
2194 "pop %esi \n"
2195 "ret \n"
2196 );
2197
2198 #elif defined(__x86_64__)
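// Averages each 8x8 block of source pixels to one output pixel (32x8 -> 4x1):
// three levels of pavgb fold the 8 rows together, psadbw sums each run of 8
// horizontal bytes, and psrlw $0x3 divides the sum by 8.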
2199 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
2200 uint8* dst_ptr, int dst_width) {
2201 asm volatile (
2202 "lea (%3,%3,2),%%r10 \n"
2203 "pxor %%xmm7,%%xmm7 \n"
2204 "1:"
2205 "movdqa (%0),%%xmm0 \n"
2206 "movdqa 0x10(%0),%%xmm1 \n"
2207 "movdqa (%0,%3,1),%%xmm2 \n"
2208 "movdqa 0x10(%0,%3,1),%%xmm3 \n"
2209 "pavgb %%xmm2,%%xmm0 \n"
2210 "pavgb %%xmm3,%%xmm1 \n"
2211 "movdqa (%0,%3,2),%%xmm2 \n"
2212 "movdqa 0x10(%0,%3,2),%%xmm3 \n"
2213 "movdqa (%0,%%r10,1),%%xmm4 \n"
2214 "movdqa 0x10(%0,%%r10,1),%%xmm5 \n"
2215 "lea (%0,%3,4),%%r11 \n"
2216 "lea 0x20(%0),%0 \n"
2217 "pavgb %%xmm4,%%xmm2 \n"
2218 "pavgb %%xmm5,%%xmm3 \n"
2219 "pavgb %%xmm2,%%xmm0 \n"
2220 "pavgb %%xmm3,%%xmm1 \n"
2221 "movdqa 0x0(%%r11),%%xmm2 \n"
2222 "movdqa 0x10(%%r11),%%xmm3 \n"
2223 "movdqa 0x0(%%r11,%3,1),%%xmm4 \n"
2224 "movdqa 0x10(%%r11,%3,1),%%xmm5 \n"
2225 "pavgb %%xmm4,%%xmm2 \n"
2226 "pavgb %%xmm5,%%xmm3 \n"
2227 "movdqa 0x0(%%r11,%3,2),%%xmm4 \n"
2228 "movdqa 0x10(%%r11,%3,2),%%xmm5 \n"
2229 "movdqa 0x0(%%r11,%%r10,1),%%xmm6 \n"
2230 "pavgb %%xmm6,%%xmm4 \n"
2231 "movdqa 0x10(%%r11,%%r10,1),%%xmm6 \n"
2232 "pavgb %%xmm6,%%xmm5 \n"
2233 "pavgb %%xmm4,%%xmm2 \n"
2234 "pavgb %%xmm5,%%xmm3 \n"
2235 "pavgb %%xmm2,%%xmm0 \n"
2236 "pavgb %%xmm3,%%xmm1 \n"
2237 "psadbw %%xmm7,%%xmm0 \n"
2238 "psadbw %%xmm7,%%xmm1 \n"
2239 "pshufd $0xd8,%%xmm0,%%xmm0 \n"
2240 "pshufd $0x8d,%%xmm1,%%xmm1 \n"
2241 "por %%xmm1,%%xmm0 \n"
2242 "psrlw $0x3,%%xmm0 \n"
2243 "packuswb %%xmm0,%%xmm0 \n"
2244 "packuswb %%xmm0,%%xmm0 \n"
2245 "movd %%xmm0,(%1) \n"
2246 "lea 0x4(%1),%1 \n"
2247 "sub $0x4,%2 \n"
2248 "ja 1b \n"
2249 : "+r"(src_ptr), // %0
2250 "+r"(dst_ptr), // %1
2251 "+r"(dst_width) // %2
2252 : "r"((intptr_t)(src_stride)) // %3
2253 : "memory", "cc", "r10", "r11", "xmm6", "xmm7"
2254 );
2255 }
2256
2257 #define HAS_SCALEROWDOWN34_SSSE3
2258 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
2259 uint8* dst_ptr, int dst_width) {
2260 asm volatile (
2261 "movdqa (%3),%%xmm3 \n"
2262 "movdqa (%4),%%xmm4 \n"
2263 "movdqa (%5),%%xmm5 \n"
2264 "1:"
2265 "movdqa (%0),%%xmm0 \n"
2266 "movdqa 0x10(%0),%%xmm2 \n"
2267 "lea 0x20(%0),%0 \n"
2268 "movdqa %%xmm2,%%xmm1 \n"
2269 "palignr $0x8,%%xmm0,%%xmm1 \n"
2270 "pshufb %%xmm3,%%xmm0 \n"
2271 "pshufb %%xmm4,%%xmm1 \n"
2272 "pshufb %%xmm5,%%xmm2 \n"
2273 "movq %%xmm0,(%1) \n"
2274 "movq %%xmm1,0x8(%1) \n"
2275 "movq %%xmm2,0x10(%1) \n"
2276 "lea 0x18(%1),%1 \n"
2277 "sub $0x18,%2 \n"
2278 "ja 1b \n"
2279 : "+r"(src_ptr), // %0
2280 "+r"(dst_ptr), // %1
2281 "+r"(dst_width) // %2
2282 : "r"(_shuf0), // %3
2283 "r"(_shuf1), // %4
2284 "r"(_shuf2) // %5
2285 : "memory", "cc"
2286 );
2287 }
2288
2289 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
2290 uint8* dst_ptr, int dst_width) {
2291 asm volatile (
2292 "movdqa (%4),%%xmm2 \n" // _shuf01
2293 "movdqa (%5),%%xmm3 \n" // _shuf11
2294 "movdqa (%6),%%xmm4 \n" // _shuf21
2295 "movdqa (%7),%%xmm5 \n" // _madd01
2296 "movdqa (%8),%%xmm6 \n" // _madd11
2297 "movdqa (%9),%%xmm7 \n" // _round34
2298 "movdqa (%10),%%xmm8 \n" // _madd21
2299 "1:"
2300 "movdqa (%0),%%xmm0 \n"
2301 "movdqa (%0,%3),%%xmm1 \n"
2302 "pavgb %%xmm1,%%xmm0 \n"
2303 "pshufb %%xmm2,%%xmm0 \n"
2304 "pmaddubsw %%xmm5,%%xmm0 \n"
2305 "paddsw %%xmm7,%%xmm0 \n"
2306 "psrlw $0x2,%%xmm0 \n"
2307 "packuswb %%xmm0,%%xmm0 \n"
2308 "movq %%xmm0,(%1) \n"
2309 "movdqu 0x8(%0),%%xmm0 \n"
2310 "movdqu 0x8(%0,%3),%%xmm1 \n"
2311 "pavgb %%xmm1,%%xmm0 \n"
2312 "pshufb %%xmm3,%%xmm0 \n"
2313 "pmaddubsw %%xmm6,%%xmm0 \n"
2314 "paddsw %%xmm7,%%xmm0 \n"
2315 "psrlw $0x2,%%xmm0 \n"
2316 "packuswb %%xmm0,%%xmm0 \n"
2317 "movq %%xmm0,0x8(%1) \n"
2318 "movdqa 0x10(%0),%%xmm0 \n"
2319 "movdqa 0x10(%0,%3),%%xmm1 \n"
2320 "lea 0x20(%0),%0 \n"
2321 "pavgb %%xmm1,%%xmm0 \n"
2322 "pshufb %%xmm4,%%xmm0 \n"
2323 "pmaddubsw %%xmm8,%%xmm0 \n"
2324 "paddsw %%xmm7,%%xmm0 \n"
2325 "psrlw $0x2,%%xmm0 \n"
2326 "packuswb %%xmm0,%%xmm0 \n"
2327 "movq %%xmm0,0x10(%1) \n"
2328 "lea 0x18(%1),%1 \n"
2329 "sub $0x18,%2 \n"
2330 "ja 1b \n"
2331 : "+r"(src_ptr), // %0
2332 "+r"(dst_ptr), // %1
2333 "+r"(dst_width) // %2
2334 : "r"((intptr_t)(src_stride)), // %3
2335 "r"(_shuf01), // %4
2336 "r"(_shuf11), // %5
2337 "r"(_shuf21), // %6
2338 "r"(_madd01), // %7
2339 "r"(_madd11), // %8
2340 "r"(_round34), // %9
2341 "r"(_madd21) // %10
2342 : "memory", "cc", "xmm6", "xmm7", "xmm8"
2343 );
2344 }
2345
2346 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
2347 uint8* dst_ptr, int dst_width) {
2348 asm volatile (
2349 "movdqa (%4),%%xmm2 \n" // _shuf01
2350 "movdqa (%5),%%xmm3 \n" // _shuf11
2351 "movdqa (%6),%%xmm4 \n" // _shuf21
2352 "movdqa (%7),%%xmm5 \n" // _madd01
2353 "movdqa (%8),%%xmm6 \n" // _madd11
2354 "movdqa (%9),%%xmm7 \n" // _round34
2355 "movdqa (%10),%%xmm8 \n" // _madd21
2356 "1:"
2357 "movdqa (%0),%%xmm0 \n"
2358 "movdqa (%0,%3,1),%%xmm1 \n"
2359 "pavgb %%xmm0,%%xmm1 \n"
2360 "pavgb %%xmm1,%%xmm0 \n"
2361 "pshufb %%xmm2,%%xmm0 \n"
2362 "pmaddubsw %%xmm5,%%xmm0 \n"
2363 "paddsw %%xmm7,%%xmm0 \n"
2364 "psrlw $0x2,%%xmm0 \n"
2365 "packuswb %%xmm0,%%xmm0 \n"
2366 "movq %%xmm0,(%1) \n"
2367 "movdqu 0x8(%0),%%xmm0 \n"
2368 "movdqu 0x8(%0,%3,1),%%xmm1 \n"
2369 "pavgb %%xmm0,%%xmm1 \n"
2370 "pavgb %%xmm1,%%xmm0 \n"
2371 "pshufb %%xmm3,%%xmm0 \n"
2372 "pmaddubsw %%xmm6,%%xmm0 \n"
2373 "paddsw %%xmm7,%%xmm0 \n"
2374 "psrlw $0x2,%%xmm0 \n"
2375 "packuswb %%xmm0,%%xmm0 \n"
2376 "movq %%xmm0,0x8(%1) \n"
2377 "movdqa 0x10(%0),%%xmm0 \n"
2378 "movdqa 0x10(%0,%3,1),%%xmm1 \n"
2379 "lea 0x20(%0),%0 \n"
2380 "pavgb %%xmm0,%%xmm1 \n"
2381 "pavgb %%xmm1,%%xmm0 \n"
2382 "pshufb %%xmm4,%%xmm0 \n"
2383 "pmaddubsw %%xmm8,%%xmm0 \n"
2384 "paddsw %%xmm7,%%xmm0 \n"
2385 "psrlw $0x2,%%xmm0 \n"
2386 "packuswb %%xmm0,%%xmm0 \n"
2387 "movq %%xmm0,0x10(%1) \n"
2388 "lea 0x18(%1),%1 \n"
2389 "sub $0x18,%2 \n"
2390 "ja 1b \n"
2391 : "+r"(src_ptr), // %0
2392 "+r"(dst_ptr), // %1
2393 "+r"(dst_width) // %2
2394 : "r"((intptr_t)(src_stride)), // %3
2395 "r"(_shuf01), // %4
2396 "r"(_shuf11), // %5
2397 "r"(_shuf21), // %6
2398 "r"(_madd01), // %7
2399 "r"(_madd11), // %8
2400 "r"(_round34), // %9
2401 "r"(_madd21) // %10
2402 : "memory", "cc", "xmm6", "xmm7", "xmm8"
2403 );
2404 }
2405
2406 #define HAS_SCALEROWDOWN38_SSSE3
2407 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
2408 uint8* dst_ptr, int dst_width) {
2409 asm volatile (
2410 "movdqa (%3),%%xmm4 \n"
2411 "movdqa (%4),%%xmm5 \n"
2412 "1:"
2413 "movdqa (%0),%%xmm0 \n"
2414 "movdqa 0x10(%0),%%xmm1 \n"
2415 "lea 0x20(%0),%0 \n"
2416 "pshufb %%xmm4,%%xmm0 \n"
2417 "pshufb %%xmm5,%%xmm1 \n"
2418 "paddusb %%xmm1,%%xmm0 \n"
2419 "movq %%xmm0,(%1) \n"
2420 "movhlps %%xmm0,%%xmm1 \n"
2421 "movd %%xmm1,0x8(%1) \n"
2422 "lea 0xc(%1),%1 \n"
2423 "sub $0xc,%2 \n"
2424 "ja 1b \n"
2425 : "+r"(src_ptr), // %0
2426 "+r"(dst_ptr), // %1
2427 "+r"(dst_width) // %2
2428 : "r"(_shuf38a), // %3
2429 "r"(_shuf38b) // %4
2430 : "memory", "cc"
2431 );
2432 }
2433
2434 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
2435 uint8* dst_ptr, int dst_width) {
2436 asm volatile (
2437 "movdqa (%4),%%xmm4 \n"
2438 "movdqa (%5),%%xmm5 \n"
2439 "movdqa (%6),%%xmm6 \n"
2440 "pxor %%xmm7,%%xmm7 \n"
2441 "1:"
2442 "movdqa (%0),%%xmm0 \n"
2443 "movdqa (%0,%3,1),%%xmm2 \n"
2444 "movhlps %%xmm0,%%xmm1 \n"
2445 "movhlps %%xmm2,%%xmm3 \n"
2446 "punpcklbw %%xmm7,%%xmm0 \n"
2447 "punpcklbw %%xmm7,%%xmm1 \n"
2448 "punpcklbw %%xmm7,%%xmm2 \n"
2449 "punpcklbw %%xmm7,%%xmm3 \n"
2450 "paddusw %%xmm2,%%xmm0 \n"
2451 "paddusw %%xmm3,%%xmm1 \n"
2452 "movdqa (%0,%3,2),%%xmm2 \n"
2453 "lea 0x10(%0),%0 \n"
2454 "movhlps %%xmm2,%%xmm3 \n"
2455 "punpcklbw %%xmm7,%%xmm2 \n"
2456 "punpcklbw %%xmm7,%%xmm3 \n"
2457 "paddusw %%xmm2,%%xmm0 \n"
2458 "paddusw %%xmm3,%%xmm1 \n"
2459 "movdqa %%xmm0,%%xmm2 \n"
2460 "psrldq $0x2,%%xmm0 \n"
2461 "paddusw %%xmm0,%%xmm2 \n"
2462 "psrldq $0x2,%%xmm0 \n"
2463 "paddusw %%xmm0,%%xmm2 \n"
2464 "pshufb %%xmm4,%%xmm2 \n"
2465 "movdqa %%xmm1,%%xmm3 \n"
2466 "psrldq $0x2,%%xmm1 \n"
2467 "paddusw %%xmm1,%%xmm3 \n"
2468 "psrldq $0x2,%%xmm1 \n"
2469 "paddusw %%xmm1,%%xmm3 \n"
2470 "pshufb %%xmm5,%%xmm3 \n"
2471 "paddusw %%xmm3,%%xmm2 \n"
2472 "pmulhuw %%xmm6,%%xmm2 \n"
2473 "packuswb %%xmm2,%%xmm2 \n"
2474 "movd %%xmm2,(%1) \n"
2475 "pextrw $0x2,%%xmm2,%%eax \n"
2476 "mov %%ax,0x4(%1) \n"
2477 "lea 0x6(%1),%1 \n"
2478 "sub $0x6,%2 \n"
2479 "ja 1b \n"
2480 : "+r"(src_ptr), // %0
2481 "+r"(dst_ptr), // %1
2482 "+r"(dst_width) // %2
2483 : "r"((intptr_t)(src_stride)), // %3
2484 "r"(_shufac0), // %4
2485 "r"(_shufac3), // %5
2486 "r"(_scaleac3) // %6
2487 : "memory", "cc", "rax", "xmm6", "xmm7"
2488 );
2489 }
2490
2491 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
2492 uint8* dst_ptr, int dst_width) {
2493 asm volatile (
2494 "movdqa (%4),%%xmm4 \n"
2495 "movdqa (%5),%%xmm5 \n"
2496 "movdqa (%6),%%xmm6 \n"
2497 "movdqa (%7),%%xmm7 \n"
2498 "1:"
2499 "movdqa (%0),%%xmm2 \n"
2500 "pavgb (%0,%3,1),%%xmm2 \n"
2501 "lea 0x10(%0),%0 \n"
2502 "movdqa %%xmm2,%%xmm0 \n"
2503 "pshufb %%xmm4,%%xmm0 \n"
2504 "movdqa %%xmm2,%%xmm1 \n"
2505 "pshufb %%xmm5,%%xmm1 \n"
2506 "paddusw %%xmm1,%%xmm0 \n"
2507 "pshufb %%xmm6,%%xmm2 \n"
2508 "paddusw %%xmm2,%%xmm0 \n"
2509 "pmulhuw %%xmm7,%%xmm0 \n"
2510 "packuswb %%xmm0,%%xmm0 \n"
2511 "movd %%xmm0,(%1) \n"
2512 "pextrw $0x2,%%xmm0,%%eax \n"
2513 "mov %%ax,0x4(%1) \n"
2514 "lea 0x6(%1),%1 \n"
2515 "sub $0x6,%2 \n"
2516 "ja 1b \n"
2517 : "+r"(src_ptr), // %0
2518 "+r"(dst_ptr), // %1
2519 "+r"(dst_width) // %2
2520 : "r"((intptr_t)(src_stride)), // %3
2521 "r"(_shufab0), // %4
2522 "r"(_shufab1), // %5
2523 "r"(_shufab2), // %6
2524 "r"(_scaleab2) // %7
2525 : "memory", "cc", "rax", "xmm6", "xmm7"
2526 );
2527 }
2528
2529 #define HAS_SCALEADDROWS_SSE2
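// Sums a column of src_height rows, 16 pixels wide, into 16 bit totals;
// used by the box filter before the horizontal averaging pass.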
2530 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
2531 uint16* dst_ptr, int src_width,
2532 int src_height) {
2533 asm volatile (
2534 "pxor %%xmm5,%%xmm5 \n"
2535 "1:"
2536 "movdqa (%0),%%xmm2 \n"
2537 "lea (%0,%4,1),%%r10 \n"
2538 "movhlps %%xmm2,%%xmm3 \n"
2539 "lea -0x1(%3),%%r11 \n"
2540 "punpcklbw %%xmm5,%%xmm2 \n"
2541 "punpcklbw %%xmm5,%%xmm3 \n"
2542
2543 "2:"
2544 "movdqa (%%r10),%%xmm0 \n"
2545 "lea (%%r10,%4,1),%%r10 \n"
2546 "movhlps %%xmm0,%%xmm1 \n"
2547 "punpcklbw %%xmm5,%%xmm0 \n"
2548 "punpcklbw %%xmm5,%%xmm1 \n"
2549 "paddusw %%xmm0,%%xmm2 \n"
2550 "paddusw %%xmm1,%%xmm3 \n"
2551 "sub $0x1,%%r11 \n"
2552 "ja 2b \n"
2553
2554 "movdqa %%xmm2,(%1) \n"
2555 "movdqa %%xmm3,0x10(%1) \n"
2556 "lea 0x20(%1),%1 \n"
2557 "lea 0x10(%0),%0 \n"
2558 "sub $0x10,%2 \n"
2559 "ja 1b \n"
2560 : "+r"(src_ptr), // %0
2561 "+r"(dst_ptr), // %1
2562 "+r"(src_width), // %2
2563 "+r"(src_height) // %3
2564 : "r"((intptr_t)(src_stride)) // %4
2565 : "memory", "cc", "r10", "r11"
2566 );
2567 }
2568
2569 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
2570 #define HAS_SCALEFILTERROWS_SSE2
2571 static void ScaleFilterRows_SSE2(uint8* dst_ptr,
2572 const uint8* src_ptr, int src_stride,
2573 int dst_width, int source_y_fraction) {
2574 if (source_y_fraction == 0) {
2575 asm volatile (
2576 "1:"
2577 "movdqa (%1),%%xmm0 \n"
2578 "lea 0x10(%1),%1 \n"
2579 "movdqa %%xmm0,(%0) \n"
2580 "lea 0x10(%0),%0 \n"
2581 "sub $0x10,%2 \n"
2582 "ja 1b \n"
2583 "mov -0x1(%0),%%al \n"
2584 "mov %%al,(%0) \n"
2585 : "+r"(dst_ptr), // %0
2586 "+r"(src_ptr), // %1
2587 "+r"(dst_width) // %2
2588 :
2589 : "memory", "cc", "rax"
2590 );
2591 return;
2592 } else if (source_y_fraction == 128) {
2593 asm volatile (
2594 "1:"
2595 "movdqa (%1),%%xmm0 \n"
2596 "movdqa (%1,%3,1),%%xmm2 \n"
2597 "lea 0x10(%1),%1 \n"
2598 "pavgb %%xmm2,%%xmm0 \n"
2599 "movdqa %%xmm0,(%0) \n"
2600 "lea 0x10(%0),%0 \n"
2601 "sub $0x10,%2 \n"
2602 "ja 1b \n"
2603 "mov -0x1(%0),%%al \n"
2604 "mov %%al,(%0) \n"
2605 : "+r"(dst_ptr), // %0
2606 "+r"(src_ptr), // %1
2607 "+r"(dst_width) // %2
2608 : "r"((intptr_t)(src_stride)) // %3
2609 : "memory", "cc", "rax"
2610 );
2611 return;
2612 } else {
2613 asm volatile (
2614 "mov %3,%%eax \n"
2615 "movd %%eax,%%xmm6 \n"
2616 "punpcklwd %%xmm6,%%xmm6 \n"
2617 "pshufd $0x0,%%xmm6,%%xmm6 \n"
2618 "neg %%eax \n"
2619 "add $0x100,%%eax \n"
2620 "movd %%eax,%%xmm5 \n"
2621 "punpcklwd %%xmm5,%%xmm5 \n"
2622 "pshufd $0x0,%%xmm5,%%xmm5 \n"
2623 "pxor %%xmm7,%%xmm7 \n"
2624 "1:"
2625 "movdqa (%1),%%xmm0 \n"
2626 "movdqa (%1,%4,1),%%xmm2 \n"
2627 "lea 0x10(%1),%1 \n"
2628 "movdqa %%xmm0,%%xmm1 \n"
2629 "movdqa %%xmm2,%%xmm3 \n"
2630 "punpcklbw %%xmm7,%%xmm0 \n"
2631 "punpcklbw %%xmm7,%%xmm2 \n"
2632 "punpckhbw %%xmm7,%%xmm1 \n"
2633 "punpckhbw %%xmm7,%%xmm3 \n"
2634 "pmullw %%xmm5,%%xmm0 \n"
2635 "pmullw %%xmm5,%%xmm1 \n"
2636 "pmullw %%xmm6,%%xmm2 \n"
2637 "pmullw %%xmm6,%%xmm3 \n"
2638 "paddusw %%xmm2,%%xmm0 \n"
2639 "paddusw %%xmm3,%%xmm1 \n"
2640 "psrlw $0x8,%%xmm0 \n"
2641 "psrlw $0x8,%%xmm1 \n"
2642 "packuswb %%xmm1,%%xmm0 \n"
2643 "movdqa %%xmm0,(%0) \n"
2644 "lea 0x10(%0),%0 \n"
2645 "sub $0x10,%2 \n"
2646 "ja 1b \n"
2647 "mov -0x1(%0),%%al \n"
2648 "mov %%al,(%0) \n"
2649 : "+r"(dst_ptr), // %0
2650 "+r"(src_ptr), // %1
2651 "+r"(dst_width), // %2
2652 "+r"(source_y_fraction) // %3
2653 : "r"((intptr_t)(src_stride)) // %4
2654 : "memory", "cc", "rax", "xmm6", "xmm7"
2655 );
2656 }
2657 return;
2658 }
2659
2660 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
2661 #define HAS_SCALEFILTERROWS_SSSE3
2662 static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
2663 const uint8* src_ptr, int src_stride,
2664 int dst_width, int source_y_fraction) {
2665 source_y_fraction >>= 1;
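  // Halve the 8 bit fraction so that both packed weights (fraction and
  // 128 - fraction) fit in the signed-byte operand of pmaddubsw; the result
  // is then shifted right by 7 instead of 8.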
2666 if (source_y_fraction == 0) {
2667 asm volatile (
2668 "1:"
2669 "movdqa (%1),%%xmm0 \n"
2670 "lea 0x10(%1),%1 \n"
2671 "movdqa %%xmm0,(%0) \n"
2672 "lea 0x10(%0),%0 \n"
2673 "sub $0x10,%2 \n"
2674 "ja 1b \n"
2675 "mov -0x1(%0),%%al \n"
2676 "mov %%al,(%0) \n"
2677 : "+r"(dst_ptr), // %0
2678 "+r"(src_ptr), // %1
2679 "+r"(dst_width) // %2
2680 :
2681 : "memory", "cc", "rax"
2682 );
2683 return;
2684 } else if (source_y_fraction == 64) {
2685 asm volatile (
2686 "1:"
2687 "movdqa (%1),%%xmm0 \n"
2688 "movdqa (%1,%3,1),%%xmm2 \n"
2689 "lea 0x10(%1),%1 \n"
2690 "pavgb %%xmm2,%%xmm0 \n"
2691 "movdqa %%xmm0,(%0) \n"
2692 "lea 0x10(%0),%0 \n"
2693 "sub $0x10,%2 \n"
2694 "ja 1b \n"
2695 "mov -0x1(%0),%%al \n"
2696 "mov %%al,(%0) \n"
2697 : "+r"(dst_ptr), // %0
2698 "+r"(src_ptr), // %1
2699 "+r"(dst_width) // %2
2700 : "r"((intptr_t)(src_stride)) // %3
2701 : "memory", "cc", "rax"
2702 );
2703 return;
2704 } else {
2705 asm volatile (
2706 "mov %3,%%eax \n"
2707 "mov %%al,%%ah \n"
2708 "neg %%al \n"
2709 "add $0x80,%%al \n"
2710 "movd %%eax,%%xmm5 \n"
2711 "punpcklwd %%xmm5,%%xmm5 \n"
2712 "pshufd $0x0,%%xmm5,%%xmm5 \n"
2713 "1:"
2714 "movdqa (%1),%%xmm0 \n"
2715 "movdqa (%1,%4,1),%%xmm2 \n"
2716 "lea 0x10(%1),%1 \n"
2717 "movdqa %%xmm0,%%xmm1 \n"
2718 "punpcklbw %%xmm2,%%xmm0 \n"
2719 "punpckhbw %%xmm2,%%xmm1 \n"
2720 "pmaddubsw %%xmm5,%%xmm0 \n"
2721 "pmaddubsw %%xmm5,%%xmm1 \n"
2722 "psrlw $0x7,%%xmm0 \n"
2723 "psrlw $0x7,%%xmm1 \n"
2724 "packuswb %%xmm1,%%xmm0 \n"
2725 "movdqa %%xmm0,(%0) \n"
2726 "lea 0x10(%0),%0 \n"
2727 "sub $0x10,%2 \n"
2728 "ja 1b \n"
2729 "mov -0x1(%0),%%al \n"
2730 "mov %%al,(%0) \n"
2731 : "+r"(dst_ptr), // %0
2732 "+r"(src_ptr), // %1
2733 "+r"(dst_width), // %2
2734 "+r"(source_y_fraction) // %3
2735 : "r"((intptr_t)(src_stride)) // %4
2736 : "memory", "cc", "rax"
2737 );
2738 }
2739 return;
2740 }
2741 #endif
2742 #endif
2743
2744 // CPU agnostic row functions
2745 static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride,
2746 uint8* dst, int dst_width) {
2747 int x;
2748 for (x = 0; x < dst_width; ++x) {
2749 *dst++ = *src_ptr;
2750 src_ptr += 2;
2751 }
2752 }
2753
2754 static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
2755 uint8* dst, int dst_width) {
2756 int x;
2757 for (x = 0; x < dst_width; ++x) {
2758 *dst++ = (src_ptr[0] + src_ptr[1] +
2759 src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
2760 src_ptr += 2;
2761 }
2762 }
2763
2764 static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride,
2765 uint8* dst, int dst_width) {
2766 int x;
2767 for (x = 0; x < dst_width; ++x) {
2768 *dst++ = *src_ptr;
2769 src_ptr += 4;
2770 }
2771 }
2772
2773 static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
2774 uint8* dst, int dst_width) {
2775 int x;
2776 for (x = 0; x < dst_width; ++x) {
2777 *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
2778 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
2779 src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
2780 src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
2781 src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
2782 src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
2783 src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
2784 8) >> 4;
2785 src_ptr += 4;
2786 }
2787 }
2788
2789 // 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
2790 // Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu.
2791 // The following 2 lines cause an error on Windows.
2792 //static const int kMaxOutputWidth = 640;
2793 //static const int kMaxRow12 = 1280; //kMaxOutputWidth * 2;
2794 #define kMaxOutputWidth 640
2795 #define kMaxRow12 1280
2796
2797 static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride,
2798 uint8* dst, int dst_width) {
2799 int x;
2800 for (x = 0; x < dst_width; ++x) {
2801 *dst++ = *src_ptr;
2802 src_ptr += 8;
2803 }
2804 }
2805
2806 // Note: calling code checks that dst_width is at most kMaxOutputWidth and,
2807 // if not, uses ScaleRowDown8_C instead.
2808 static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
2809 uint8* dst, int dst_width) {
2810 ALIGN16(uint8 src_row[kMaxRow12 * 2]);
2811 assert(dst_width <= kMaxOutputWidth);
2812 ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
2813 ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
2814 src_row + kMaxOutputWidth,
2815 dst_width * 2);
2816 ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
2817 }
2818
2819 static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride,
2820 uint8* dst, int dst_width) {
2821 uint8* dend;
2822 assert((dst_width % 3 == 0) && (dst_width > 0));
2823 dend = dst + dst_width;
2824 do {
2825 dst[0] = src_ptr[0];
2826 dst[1] = src_ptr[1];
2827 dst[2] = src_ptr[3];
2828 dst += 3;
2829 src_ptr += 4;
2830 } while (dst < dend);
2831 }
2832
2833 // Filter rows 0 and 1 together, 3 : 1
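// Horizontally, each group of 4 source pixels produces 3 outputs with weights
// (3,1)/4, (1,1)/2 and (1,3)/4; the two rows are then blended 3:1 in favor of row 0.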
2834 static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
2835 uint8* d, int dst_width) {
2836 uint8* dend;
2837 const uint8* s;
2838 const uint8* t;
2839 assert((dst_width % 3 == 0) && (dst_width > 0));
2840 dend = d + dst_width;
2841 s = src_ptr;
2842 t = src_ptr + src_stride;
2843 do {
2844 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2845 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2846 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2847 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
2848 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
2849 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
2850 d[0] = (a0 * 3 + b0 + 2) >> 2;
2851 d[1] = (a1 * 3 + b1 + 2) >> 2;
2852 d[2] = (a2 * 3 + b2 + 2) >> 2;
2853 d += 3;
2854 s += 4;
2855 t += 4;
2856 } while (d < dend);
2857 }
2858
2859 // Filter rows 1 and 2 together, 1 : 1
2860 static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
2861 uint8* d, int dst_width) {
2862 uint8* dend;
2863 const uint8* s;
2864 const uint8* t;
2865 assert((dst_width % 3 == 0) && (dst_width > 0));
2866 dend = d + dst_width;
2867 s = src_ptr;
2868 t = src_ptr + src_stride;
2869 do {
2870 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2871 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2872 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2873 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
2874 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
2875 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
2876 d[0] = (a0 + b0 + 1) >> 1;
2877 d[1] = (a1 + b1 + 1) >> 1;
2878 d[2] = (a2 + b2 + 1) >> 1;
2879 d += 3;
2880 s += 4;
2881 t += 4;
2882 } while (d < dend);
2883 }
2884
2885 #if defined(HAS_SCALEFILTERROWS_SSE2)
2886 // Filter row to 3/4
2887 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
2888 int dst_width) {
2889 uint8* dend;
2890 const uint8* s;
2891 assert((dst_width % 3 == 0) && (dst_width > 0));
2892 dend = dst_ptr + dst_width;
2893 s = src_ptr;
2894 do {
2895 dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2896 dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2897 dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2898 dst_ptr += 3;
2899 s += 4;
2900 } while (dst_ptr < dend);
2901 }
2902 #endif
2903
2904 static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
2905 int dst_width, int dx) {
2906 int x = 0;
2907 int j;
2908 for (j = 0; j < dst_width; ++j) {
2909 int xi = x >> 16;
2910 int xf1 = x & 0xffff;
2911 int xf0 = 65536 - xf1;
2912
2913 *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
2914 x += dx;
2915 }
2916 }
2917
2918 // Does not work on Windows.
2919 //static const int kMaxInputWidth = 2560;
2920 #define kMaxInputWidth 2560
2921 #if defined(HAS_SCALEFILTERROWS_SSE2)
2922 #define HAS_SCALEROWDOWN34_SSE2
2923 // Filter rows 0 and 1 together, 3 : 1
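// A source_y_fraction of 256 / 4 makes ScaleFilterRows_SSE2 blend the two rows
// 192:64, i.e. 3:1; the 4/3-wide intermediate row is then reduced to 3/4 width
// by ScaleFilterCols34_C.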
2924 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
2925 uint8* dst_ptr, int dst_width) {
2926 ALIGN16(uint8 row[kMaxInputWidth]);
2927 assert((dst_width % 3 == 0) && (dst_width > 0));
2928 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
2929 ScaleFilterCols34_C(dst_ptr, row, dst_width);
2930 }
2931
2932 // Filter rows 1 and 2 together, 1 : 1
2933 static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
2934 uint8* dst_ptr, int dst_width) {
2935 ALIGN16(uint8 row[kMaxInputWidth]);
2936 assert((dst_width % 3 == 0) && (dst_width > 0));
2937 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
2938 ScaleFilterCols34_C(dst_ptr, row, dst_width);
2939 }
2940 #endif
2941
2942 static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride,
2943 uint8* dst, int dst_width) {
2944 int x;
2945 assert(dst_width % 3 == 0);
2946 for (x = 0; x < dst_width; x += 3) {
2947 dst[0] = src_ptr[0];
2948 dst[1] = src_ptr[3];
2949 dst[2] = src_ptr[6];
2950 dst += 3;
2951 src_ptr += 8;
2952 }
2953 }
2954
2955 // 8x3 -> 3x1
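// Each output is the average of a 3x3 block of source pixels (2x3 for the third
// output of each group); the division uses a 16.16 reciprocal multiply,
// e.g. sum * (65536 / 9) >> 16.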
2956 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
2957 uint8* dst_ptr, int dst_width) {
2958 int i;
2959 assert((dst_width % 3 == 0) && (dst_width > 0));
2960 for (i = 0; i < dst_width; i+=3) {
2961 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
2962 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
2963 src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
2964 src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
2965 (65536 / 9) >> 16;
2966 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
2967 src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
2968 src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
2969 src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
2970 (65536 / 9) >> 16;
2971 dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
2972 src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
2973 src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
2974 (65536 / 6) >> 16;
2975 src_ptr += 8;
2976 dst_ptr += 3;
2977 }
2978 }
2979
2980 // 8x2 -> 3x1
2981 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
2982 uint8* dst_ptr, int dst_width) {
2983 int i;
2984 assert((dst_width % 3 == 0) && (dst_width > 0));
2985 for (i = 0; i < dst_width; i+=3) {
2986 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
2987 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
2988 src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
2989 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
2990 src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
2991 src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
2992 dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
2993 src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
2994 (65536 / 4) >> 16;
2995 src_ptr += 8;
2996 dst_ptr += 3;
2997 }
2998 }
2999
3000 // C version 8x2 -> 8x1
3001 static void ScaleFilterRows_C(uint8* dst_ptr,
3002 const uint8* src_ptr, int src_stride,
3003 int dst_width, int source_y_fraction) {
3004 int y1_fraction;
3005 int y0_fraction;
3006 const uint8* src_ptr1;
3007 uint8* end;
3008 assert(dst_width > 0);
3009 y1_fraction = source_y_fraction;
3010 y0_fraction = 256 - y1_fraction;
3011 src_ptr1 = src_ptr + src_stride;
3012 end = dst_ptr + dst_width;
3013 do {
3014 dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
3015 dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
3016 dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
3017 dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
3018 dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
3019 dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
3020 dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
3021 dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
3022 src_ptr += 8;
3023 src_ptr1 += 8;
3024 dst_ptr += 8;
3025 } while (dst_ptr < end);
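  // Replicate the last pixel so the horizontal filter that follows can safely
  // read one pixel past the end of the row.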
3026 dst_ptr[0] = dst_ptr[-1];
3027 }
3028
3029 void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
3030 uint16* dst_ptr, int src_width, int src_height) {
3031 int x,y;
3032 assert(src_width > 0);
3033 assert(src_height > 0);
3034 for (x = 0; x < src_width; ++x) {
3035 const uint8* s = src_ptr + x;
3036 int sum = 0;
3037 for (y = 0; y < src_height; ++y) {
3038 sum += s[0];
3039 s += src_stride;
3040 }
3041 dst_ptr[x] = sum;
3042 }
3043 }
3044
3045 /**
3046 * Scale plane, 1/2
3047 *
3048 * This is an optimized version for scaling down a plane to 1/2 of
3049 * its original size.
3050 *
3051 */
3052 static void ScalePlaneDown2(int src_width, int src_height,
3053 int dst_width, int dst_height,
3054 int src_stride, int dst_stride,
3055 const uint8* src_ptr, uint8* dst_ptr,
3056 FilterMode filtering) {
3057 void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
3058 uint8* dst_ptr, int dst_width);
3059 assert(IS_ALIGNED(src_width, 2));
3060 assert(IS_ALIGNED(src_height, 2));
3061
3062 #if defined(HAS_SCALEROWDOWN2_NEON)
3063 if (TestCpuFlag(kCpuHasNEON) &&
3064 IS_ALIGNED(dst_width, 16)) {
3065 ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
3066 } else
3067 #endif
3068 #if defined(HAS_SCALEROWDOWN2_SSE2)
3069 if (TestCpuFlag(kCpuHasSSE2) &&
3070 IS_ALIGNED(dst_width, 16) &&
3071 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
3072 IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
3073 ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
3074 } else
3075 #endif
3076 {
3077 ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
3078 }
3079
3080 {
3081 int y;
3082 for (y = 0; y < dst_height; ++y) {
3083 ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
3084 src_ptr += (src_stride << 1);
3085 dst_ptr += dst_stride;
3086 }
3087 }
3088 }
3089
3090 /**
3091 * Scale plane, 1/4
3092 *
3093 * This is an optimized version for scaling down a plane to 1/4 of
3094 * its original size.
3095 */
3096 static void ScalePlaneDown4(int src_width, int src_height,
3097 int dst_width, int dst_height,
3098 int src_stride, int dst_stride,
3099 const uint8* src_ptr, uint8* dst_ptr,
3100 FilterMode filtering) {
3101 void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
3102 uint8* dst_ptr, int dst_width);
3103 assert(IS_ALIGNED(src_width, 4));
3104 assert(IS_ALIGNED(src_height, 4));
3105
3106 #if defined(HAS_SCALEROWDOWN4_NEON)
3107 if (TestCpuFlag(kCpuHasNEON) &&
3108 IS_ALIGNED(dst_width, 4)) {
3109 ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
3110 } else
3111 #endif
3112 #if defined(HAS_SCALEROWDOWN4_SSE2)
3113 if (TestCpuFlag(kCpuHasSSE2) &&
3114 IS_ALIGNED(dst_width, 8) &&
3115 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
3116 IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) {
3117 ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
3118 } else
3119 #endif
3120 {
3121 ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
3122 }
3123
3124 {
3125 int y;
3126 for (y = 0; y < dst_height; ++y) {
3127 ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
3128 src_ptr += (src_stride << 2);
3129 dst_ptr += dst_stride;
3130 }
3131 }
3132 }
3133
3134 /**
3135 * Scale plane, 1/8
3136 *
3137 * This is an optimized version for scaling down a plane to 1/8
3138 * of its original size.
3139 *
3140 */
3141 static void ScalePlaneDown8(int src_width, int src_height,
3142 int dst_width, int dst_height,
3143 int src_stride, int dst_stride,
3144 const uint8* src_ptr, uint8* dst_ptr,
3145 FilterMode filtering) {
3146 void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
3147 uint8* dst_ptr, int dst_width);
3148 assert(IS_ALIGNED(src_width, 8));
3149 assert(IS_ALIGNED(src_height, 8));
3150
3151 #if defined(HAS_SCALEROWDOWN8_SSE2)
3152 if (TestCpuFlag(kCpuHasSSE2) &&
3153 IS_ALIGNED(dst_width, 4) &&
3154 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
3155 IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
3156 ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
3157 } else
3158 #endif
3159 {
3160 ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?
3161 ScaleRowDown8Int_C : ScaleRowDown8_C;
3162 }
3163
3164 {
3165 int y;
3166 for (y = 0; y < dst_height; ++y) {
3167 ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
3168 src_ptr += (src_stride << 3);
3169 dst_ptr += dst_stride;
3170 }
3171 }
3172 }
3173
3174 /**
3175 * Scale plane down, 3/4
3176 *
3177 * Provided by Frank Barchard (fbarchard@google.com)
3178 *
3179 */
3180 static void ScalePlaneDown34(int src_width, int src_height,
3181 int dst_width, int dst_height,
3182 int src_stride, int dst_stride,
3183 const uint8* src_ptr, uint8* dst_ptr,
3184 FilterMode filtering) {
3185 void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
3186 uint8* dst_ptr, int dst_width);
3187 void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
3188 uint8* dst_ptr, int dst_width);
3189 assert(dst_width % 3 == 0);
3190 #if defined(HAS_SCALEROWDOWN34_NEON)
3191 if (TestCpuFlag(kCpuHasNEON) &&
3192 (dst_width % 24 == 0)) {
3193 if (!filtering) {
3194 ScaleRowDown34_0 = ScaleRowDown34_NEON;
3195 ScaleRowDown34_1 = ScaleRowDown34_NEON;
3196 } else {
3197 ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
3198 ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
3199 }
3200 } else
3201 #endif
3202
3203 #if defined(HAS_SCALEROWDOWN34_SSSE3)
3204 if (TestCpuFlag(kCpuHasSSSE3) &&
3205 (dst_width % 24 == 0) &&
3206 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
3207 IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) {
3208 if (!filtering) {
3209 ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
3210 ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
3211 } else {
3212 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
3213 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
3214 }
3215 } else
3216 #endif
3217 #if defined(HAS_SCALEROWDOWN34_SSE2)
3218 if (TestCpuFlag(kCpuHasSSE2) &&
3219 (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) &&
3220 IS_ALIGNED(dst_stride, 8) &&
3221 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) &&
3222 filtering) {
3223 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
3224 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
3225 } else
3226 #endif
3227 {
3228 if (!filtering) {
3229 ScaleRowDown34_0 = ScaleRowDown34_C;
3230 ScaleRowDown34_1 = ScaleRowDown34_C;
3231 } else {
3232 ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
3233 ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
3234 }
3235 }
3236 {
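    // Each group of 4 source rows produces 3 destination rows: row 0 blends
    // source rows 0 and 1 (3:1), row 1 blends rows 1 and 2 (1:1), and row 2
    // blends rows 2 and 3 (1:3) by running the 3:1 kernel with a negative stride.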
3237 int src_row = 0;
3238 int y;
3239 for (y = 0; y < dst_height; ++y) {
3240 switch (src_row) {
3241 case 0:
3242 ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
3243 break;
3244
3245 case 1:
3246 ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
3247 break;
3248
3249 case 2:
3250 ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
3251 dst_ptr, dst_width);
3252 break;
3253 }
3254 ++src_row;
3255 src_ptr += src_stride;
3256 dst_ptr += dst_stride;
3257 if (src_row >= 3) {
3258 src_ptr += src_stride;
3259 src_row = 0;
3260 }
3261 }
3262 }
3263 }
3264
3265 /**
3266 * Scale plane, 3/8
3267 *
3268 * This is an optimized version for scaling down a plane to 3/8
3269 * of its original size.
3270 *
3271 * Reduces 16x3 to 6x1
3272 */
3273 static void ScalePlaneDown38(int src_width, int src_height,
3274 int dst_width, int dst_height,
3275 int src_stride, int dst_stride,
3276 const uint8* src_ptr, uint8* dst_ptr,
3277 FilterMode filtering) {
3278 void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
3279 uint8* dst_ptr, int dst_width);
3280 void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
3281 uint8* dst_ptr, int dst_width);
3282 assert(dst_width % 3 == 0);
3283 #if defined(HAS_SCALEROWDOWN38_NEON)
3284 if (TestCpuFlag(kCpuHasNEON) &&
3285 (dst_width % 12 == 0)) {
3286 if (!filtering) {
3287 ScaleRowDown38_3 = ScaleRowDown38_NEON;
3288 ScaleRowDown38_2 = ScaleRowDown38_NEON;
3289 } else {
3290 ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
3291 ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
3292 }
3293 } else
3294 #endif
3295
3296 #if defined(HAS_SCALEROWDOWN38_SSSE3)
3297 if (TestCpuFlag(kCpuHasSSSE3) &&
3298 (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) &&
3299 IS_ALIGNED(dst_stride, 8) &&
3300 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
3301 if (!filtering) {
3302 ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
3303 ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
3304 } else {
3305 ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
3306 ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
3307 }
3308 } else
3309 #endif
3310 {
3311 if (!filtering) {
3312 ScaleRowDown38_3 = ScaleRowDown38_C;
3313 ScaleRowDown38_2 = ScaleRowDown38_C;
3314 } else {
3315 ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
3316 ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
3317 }
3318 }
3319 {
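    // Each group of 8 source rows produces 3 destination rows: two passes over
    // 3 rows each, then one pass over the remaining 2 rows (3 + 3 + 2 = 8).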
3320 int src_row = 0;
3321 int y;
3322 for (y = 0; y < dst_height; ++y) {
3323 switch (src_row) {
3324 case 0:
3325 case 1:
3326 ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
3327 src_ptr += src_stride * 3;
3328 ++src_row;
3329 break;
3330
3331 case 2:
3332 ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
3333 src_ptr += src_stride * 2;
3334 src_row = 0;
3335 break;
3336 }
3337 dst_ptr += dst_stride;
3338 }
3339 }
3340 }
3341
3342 __inline static uint32 SumBox(int iboxwidth, int iboxheight,
3343 int src_stride, const uint8* src_ptr) {
3344 int x, y;
3345 uint32 sum;
3346 assert(iboxwidth > 0);
3347 assert(iboxheight > 0);
3348 sum = 0u;
3349 for (y = 0; y < iboxheight; ++y) {
3350 for (x = 0; x < iboxwidth; ++x) {
3351 sum += src_ptr[x];
3352 }
3353 src_ptr += src_stride;
3354 }
3355 return sum;
3356 }
3357
3358 static void ScalePlaneBoxRow(int dst_width, int boxheight,
3359 int dx, int src_stride,
3360 const uint8* src_ptr, uint8* dst_ptr) {
3361 int x = 0;
3362 int i;
3363 for (i = 0; i < dst_width; ++i) {
3364 int ix = x >> 16;
3365 int boxwidth;
3366 x += dx;
3367 boxwidth = (x >> 16) - ix;
3368 *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
3369 (boxwidth * boxheight);
3370 }
3371 }
3372
3373 __inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
3374 uint32 sum;
3375 int x;
3376 assert(iboxwidth > 0);
3377 sum = 0u;
3378 for (x = 0; x < iboxwidth; ++x) {
3379 sum += src_ptr[x];
3380 }
3381 return sum;
3382 }
3383
3384 static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
3385 const uint16* src_ptr, uint8* dst_ptr) {
3386 int scaletbl[2];
3387 int minboxwidth = (dx >> 16);
3388 scaletbl[0] = 65536 / (minboxwidth * boxheight);
3389 scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
3390 {
3391 int *scaleptr = scaletbl - minboxwidth;
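    // scaleptr is biased so that scaleptr[boxwidth] is scaletbl[0] when
    // boxwidth == minboxwidth and scaletbl[1] when it is minboxwidth + 1, the
    // only two widths the 16.16 stepping can produce.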
3392 int x = 0;
3393 int i;
3394 for (i = 0; i < dst_width; ++i) {
3395 int ix = x >> 16;
3396 int boxwidth;
3397 x += dx;
3398 boxwidth = (x >> 16) - ix;
3399 *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
3400 }
3401 }
3402 }
3403
3404 static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
3405 const uint16* src_ptr, uint8* dst_ptr) {
3406 int boxwidth = (dx >> 16);
3407 int scaleval = 65536 / (boxwidth * boxheight);
3408 int x = 0;
3409 int i;
3410 for (i = 0; i < dst_width; ++i) {
3411 *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
3412 x += boxwidth;
3413 }
3414 }
3415
3416 /**
3417 * Scale plane down to any dimensions, with interpolation.
3418 * (boxfilter).
3419 *
3420 * Same method as SimpleScale, which is fixed point: each
3421 * destination pixel is produced by stepping through the source
3422 * in fixed point (16.16) and averaging the box of source
3423 * pixels it covers.
3424 */
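// The box width comes from the same 16.16 stepping:
// boxwidth = ((x + dx) >> 16) - (x >> 16), so e.g. dx = 2.5 in 16.16 alternates
// boxes 2 and 3 pixels wide and adjacent boxes tile the source exactly.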
3425 static void ScalePlaneBox(int src_width, int src_height,
3426 int dst_width, int dst_height,
3427 int src_stride, int dst_stride,
3428 const uint8* src_ptr, uint8* dst_ptr) {
3429 int dx, dy;
3430 assert(dst_width > 0);
3431 assert(dst_height > 0);
3432 dy = (src_height << 16) / dst_height;
3433 dx = (src_width << 16) / dst_width;
3434 if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
3435 dst_height * 2 > src_height) {
3436 uint8* dst = dst_ptr;
3437 int dy = (src_height << 16) / dst_height;
3438 int dx = (src_width << 16) / dst_width;
3439 int y = 0;
3440 int j;
3441 for (j = 0; j < dst_height; ++j) {
3442 int iy = y >> 16;
3443 const uint8* const src = src_ptr + iy * src_stride;
3444 int boxheight;
3445 y += dy;
3446 if (y > (src_height << 16)) {
3447 y = (src_height << 16);
3448 }
3449 boxheight = (y >> 16) - iy;
3450 ScalePlaneBoxRow(dst_width, boxheight,
3451 dx, src_stride,
3452 src, dst);
3453
3454 dst += dst_stride;
3455 }
3456 } else {
3457 ALIGN16(uint16 row[kMaxInputWidth]);
3458 void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
3459 uint16* dst_ptr, int src_width, int src_height);
3460 void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
3461 const uint16* src_ptr, uint8* dst_ptr);
3462 #if defined(HAS_SCALEADDROWS_SSE2)
3463 if (TestCpuFlag(kCpuHasSSE2) &&
3464 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
3465 IS_ALIGNED(src_width, 16)) {
3466 ScaleAddRows = ScaleAddRows_SSE2;
3467 } else
3468 #endif
3469 {
3470 ScaleAddRows = ScaleAddRows_C;
3471 }
3472 if (dx & 0xffff) {
3473 ScaleAddCols = ScaleAddCols2_C;
3474 } else {
3475 ScaleAddCols = ScaleAddCols1_C;
3476 }
3477
3478 {
3479 int y = 0;
3480 int j;
3481 for (j = 0; j < dst_height; ++j) {
3482 int iy = y >> 16;
3483 const uint8* const src = src_ptr + iy * src_stride;
3484 int boxheight;
3485 y += dy;
3486 if (y > (src_height << 16)) {
3487 y = (src_height << 16);
3488 }
3489 boxheight = (y >> 16) - iy;
3490 ScaleAddRows(src, src_stride, row, src_width, boxheight);
3491 ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
3492 dst_ptr += dst_stride;
3493 }
3494 }
3495 }
3496 }
3497
3498 /**
3499 * Scale plane to/from any dimensions, with interpolation.
3500 */
3501 static void ScalePlaneBilinearSimple(int src_width, int src_height,
3502 int dst_width, int dst_height,
3503 int src_stride, int dst_stride,
3504 const uint8* src_ptr, uint8* dst_ptr) {
3505 int i, j;
3506 uint8* dst = dst_ptr;
3507 int dx = (src_width << 16) / dst_width;
3508 int dy = (src_height << 16) / dst_height;
3509 int maxx = ((src_width - 1) << 16) - 1;
3510 int maxy = ((src_height - 1) << 16) - 1;
3511 int y = (dst_height < src_height) ? 32768 :
3512 (src_height << 16) / dst_height - 32768;
3513 for (i = 0; i < dst_height; ++i) {
3514 int cy = (y < 0) ? 0 : y;
3515 int yi = cy >> 16;
3516 int yf = cy & 0xffff;
3517 const uint8* const src = src_ptr + yi * src_stride;
3518 int x = (dst_width < src_width) ? 32768 :
3519 (src_width << 16) / dst_width - 32768;
3520 for (j = 0; j < dst_width; ++j) {
3521 int cx = (x < 0) ? 0 : x;
3522 int xi = cx >> 16;
3523 int xf = cx & 0xffff;
3524 int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
3525 int r1 = (src[xi + src_stride] * (65536 - xf) +
3526 src[xi + src_stride + 1] * xf) >> 16;
3527 *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
3528 x += dx;
3529 if (x > maxx)
3530 x = maxx;
3531 }
3532 dst += dst_stride - dst_width;
3533 y += dy;
3534 if (y > maxy)
3535 y = maxy;
3536 }
3537 }
3538
3539 /**
3540 * Scale plane to/from any dimensions, with bilinear
3541 * interpolation.
3542 */
3543 static void ScalePlaneBilinear(int src_width, int src_height,
3544 int dst_width, int dst_height,
3545 int src_stride, int dst_stride,
3546 const uint8* src_ptr, uint8* dst_ptr) {
3547 int dy;
3548 int dx;
3549 assert(dst_width > 0);
3550 assert(dst_height > 0);
3551 dy = (src_height << 16) / dst_height;
3552 dx = (src_width << 16) / dst_width;
3553 if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
3554 ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
3555 src_stride, dst_stride, src_ptr, dst_ptr);
3556
3557 } else {
3558 ALIGN16(uint8 row[kMaxInputWidth + 1]);
3559 void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
3560 int src_stride,
3561 int dst_width, int source_y_fraction);
3562 void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
3563 int dst_width, int dx);
3564 #if defined(HAS_SCALEFILTERROWS_SSSE3)
3565 if (TestCpuFlag(kCpuHasSSSE3) &&
3566 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
3567 IS_ALIGNED(src_width, 16)) {
3568 ScaleFilterRows = ScaleFilterRows_SSSE3;
3569 } else
3570 #endif
3571 #if defined(HAS_SCALEFILTERROWS_SSE2)
3572 if (TestCpuFlag(kCpuHasSSE2) &&
3573 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
3574 IS_ALIGNED(src_width, 16)) {
3575 ScaleFilterRows = ScaleFilterRows_SSE2;
3576 } else
3577 #endif
3578 {
3579 ScaleFilterRows = ScaleFilterRows_C;
3580 }
3581 ScaleFilterCols = ScaleFilterCols_C;
3582
3583 {
3584 int y = 0;
3585 int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows.
3586 int j;
3587 for (j = 0; j < dst_height; ++j) {
3588 int iy = y >> 16;
3589 int fy = (y >> 8) & 255;
3590 const uint8* const src = src_ptr + iy * src_stride;
3591 ScaleFilterRows(row, src, src_stride, src_width, fy);
3592 ScaleFilterCols(dst_ptr, row, dst_width, dx);
3593 dst_ptr += dst_stride;
3594 y += dy;
3595 if (y > maxy) {
3596 y = maxy;
3597 }
3598 }
3599 }
3600 }
3601 }
3602
3603 /**
3604 * Scale plane to/from any dimensions, without interpolation.
3605 * Fixed point math is used for performance: The upper 16 bits
3606 * of x and dx is the integer part of the source position and
3607 * the lower 16 bits are the fixed decimal part.
3608 */
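// Example: scaling a 640 pixel row to 480 gives dx = (640 << 16) / 480 = 0x15555
// (~1.33 source pixels per output pixel); x >> 16 selects the source column and
// the low 16 bits accumulate the fractional step.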
3609 static void ScalePlaneSimple(int src_width, int src_height,
3610 int dst_width, int dst_height,
3611 int src_stride, int dst_stride,
3612 const uint8* src_ptr, uint8* dst_ptr) {
3613 uint8* dst = dst_ptr;
3614 int dx = (src_width << 16) / dst_width;
3615 int y;
3616 for (y = 0; y < dst_height; ++y) {
3617 const uint8* const src = src_ptr + (y * src_height / dst_height) *
3618 src_stride;
3619 // TODO(fbarchard): Round X coordinate by setting x=0x8000.
3620 int x = 0;
3621 int i;
3622 for (i = 0; i < dst_width; ++i) {
3623 *dst++ = src[x >> 16];
3624 x += dx;
3625 }
3626 dst += dst_stride - dst_width;
3627 }
3628 }
3629
3630 /**
3631 * Scale plane to/from any dimensions.
3632 */
3633 static void ScalePlaneAnySize(int src_width, int src_height,
3634 int dst_width, int dst_height,
3635 int src_stride, int dst_stride,
3636 const uint8* src_ptr, uint8* dst_ptr,
3637 FilterMode filtering) {
3638 if (!filtering) {
3639 ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
3640 src_stride, dst_stride, src_ptr, dst_ptr);
3641 } else {
3642 // fall back to non-optimized version
3643 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
3644 src_stride, dst_stride, src_ptr, dst_ptr);
3645 }
3646 }
3647
3648 /**
3649 * Scale plane down, any size
3650 *
3651 * This is an optimized version for scaling down a plane to any size.
3652 * The current implementation is ~10 times faster than the
3653 * reference implementation for e.g. XGA->LowResPAL.
3654 *
3655 */
3656 static void ScalePlaneDown(int src_width, int src_height,
3657 int dst_width, int dst_height,
3658 int src_stride, int dst_stride,
3659 const uint8* src_ptr, uint8* dst_ptr,
3660 FilterMode filtering) {
3661 if (!filtering) {
3662 ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
3663 src_stride, dst_stride, src_ptr, dst_ptr);
3664 } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) {
3665 // between 1/2x and 1x use bilinear
3666 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
3667 src_stride, dst_stride, src_ptr, dst_ptr);
3668 } else {
3669 ScalePlaneBox(src_width, src_height, dst_width, dst_height,
3670 src_stride, dst_stride, src_ptr, dst_ptr);
3671 }
3672 }
3673
3674 /**
3675 * Copy plane, no scaling
3676 *
3677 * This simply copies the given plane without scaling.
3678 * The current implementation is ~115 times faster
3679 * than the reference implementation.
3680 *
3681 */
3682 static void CopyPlane(int src_width, int src_height,
3683 int dst_width, int dst_height,
3684 int src_stride, int dst_stride,
3685 const uint8* src_ptr, uint8* dst_ptr) {
3686 if (src_stride == src_width && dst_stride == dst_width) {
3687 // All contiguous, so can use REALLY fast path.
3688 memcpy(dst_ptr, src_ptr, src_width * src_height);
3689 } else {
3690 // Not all contiguous; must copy scanlines individually
3691 const uint8* src = src_ptr;
3692 uint8* dst = dst_ptr;
3693 int i;
3694 for (i = 0; i < src_height; ++i) {
3695 memcpy(dst, src, src_width);
3696 dst += dst_stride;
3697 src += src_stride;
3698 }
3699 }
3700 }
3701
3702 static void ScalePlane(const uint8* src, int src_stride,
3703 int src_width, int src_height,
3704 uint8* dst, int dst_stride,
3705 int dst_width, int dst_height,
3706 FilterMode filtering, int use_ref) {
3707 // Use specialized scales to improve performance for common resolutions.
3708 // For example, all the 1/2 scalings will use ScalePlaneDown2()
  if (dst_width == src_width && dst_height == src_height) {
    // Straight copy.
    CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
              dst_stride, src, dst);
  } else if (dst_width <= src_width && dst_height <= src_height) {
    // Scale down.
    if (use_ref) {
      // For testing, allow the optimized versions to be disabled.
      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
    } else if (4 * dst_width == 3 * src_width &&
               4 * dst_height == 3 * src_height) {
      // optimized, 3/4
      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
    } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
      // optimized, 1/2
      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    // 3/8 rounded up for odd sized chroma height.
    } else if (8 * dst_width == 3 * src_width &&
               dst_height == ((src_height * 3 + 7) / 8)) {
      // optimized, 3/8
      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
    } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
      // optimized, 1/4
      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
      // optimized, 1/8
      ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    } else {
      // Arbitrary downsample
      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
    }
  } else {
    // Arbitrary scale up and/or down.
    ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
  }
}

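/**
 * Illustrative sketch (editor's note, not part of the library) of how the
 * dispatch above resolves for some example sizes, assuming use_ref == 0 and
 * kFilterBilinear filtering:
 *
 *   1280x720 -> 1280x720 : CopyPlane         (same size, straight copy)
 *   1280x720 ->  640x360 : ScalePlaneDown2   (exact 1/2 in both dimensions)
 *    640x480 ->  480x360 : ScalePlaneDown34  (4 * dst == 3 * src)
 *    640x480 ->  240x180 : ScalePlaneDown38  (8 * 240 == 3 * 640 and
 *                                             180 == (480 * 3 + 7) / 8)
 *    640x480 ->  160x120 : ScalePlaneDown4   (exact 1/4)
 *    640x480 ->  600x400 : ScalePlaneDown    (arbitrary downsample)
 *    640x480 -> 1280x960 : ScalePlaneAnySize (upscale)
 *
 * With use_ref != 0, every downscale goes through ScalePlaneDown instead.
 */
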
/**
 * Scale an I420 image to the specified size.
 *
 * For each of the three planes, this function in turn calls a scaling
 * function suitable for handling the desired resolutions.
 *
 */

int I420Scale(const uint8* src_y, int src_stride_y,
              const uint8* src_u, int src_stride_u,
              const uint8* src_v, int src_stride_v,
              int src_width, int src_height,
              uint8* dst_y, int dst_stride_y,
              uint8* dst_u, int dst_stride_u,
              uint8* dst_v, int dst_stride_v,
              int dst_width, int dst_height,
              FilterMode filtering) {
  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (src_height < 0) {
    int halfheight;
    src_height = -src_height;
    halfheight = (src_height + 1) >> 1;
    src_y = src_y + (src_height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }
  {
    int src_halfwidth = (src_width + 1) >> 1;
    int src_halfheight = (src_height + 1) >> 1;
    int dst_halfwidth = (dst_width + 1) >> 1;
    int dst_halfheight = (dst_height + 1) >> 1;

    ScalePlane(src_y, src_stride_y, src_width, src_height,
               dst_y, dst_stride_y, dst_width, dst_height,
               filtering, use_reference_impl_);
    ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
               dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
               filtering, use_reference_impl_);
    ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
               dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
               filtering, use_reference_impl_);
  }
  return 0;
}

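/**
 * Illustrative usage sketch (editor's note, not part of the library):
 * scaling a contiguous 1280x720 I420 frame down to 640x360 with bilinear
 * filtering. The buffer names and sizes are assumptions for the example.
 *
 *   const uint8* src = ...;  // 1280 * 720 * 3 / 2 bytes of I420 data
 *   uint8* dst = ...;        // 640 * 360 * 3 / 2 byte destination
 *   const uint8* src_u = src + 1280 * 720;
 *   const uint8* src_v = src_u + 640 * 360;
 *   uint8* dst_u = dst + 640 * 360;
 *   uint8* dst_v = dst_u + 320 * 180;
 *   I420Scale(src, 1280, src_u, 640, src_v, 640, 1280, 720,
 *             dst, 640, dst_u, 320, dst_v, 320, 640, 360,
 *             kFilterBilinear);
 *
 * Passing src_height as -720 instead reads the source bottom-up and
 * produces a vertically flipped result.
 */
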
// Deprecated api
int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
          int src_stride_y, int src_stride_u, int src_stride_v,
          int src_width, int src_height,
          uint8* dst_y, uint8* dst_u, uint8* dst_v,
          int dst_stride_y, int dst_stride_u, int dst_stride_v,
          int dst_width, int dst_height,
          int interpolate) {
  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (src_height < 0) {
    int halfheight;
    src_height = -src_height;
    halfheight = (src_height + 1) >> 1;
    src_y = src_y + (src_height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }
  {
    int src_halfwidth = (src_width + 1) >> 1;
    int src_halfheight = (src_height + 1) >> 1;
    int dst_halfwidth = (dst_width + 1) >> 1;
    int dst_halfheight = (dst_height + 1) >> 1;
    FilterMode filtering = interpolate ? kFilterBox : kFilterNone;

    ScalePlane(src_y, src_stride_y, src_width, src_height,
               dst_y, dst_stride_y, dst_width, dst_height,
               filtering, use_reference_impl_);
    ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
               dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
               filtering, use_reference_impl_);
    ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
               dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
               filtering, use_reference_impl_);
  }
  return 0;
}

// Deprecated api
int ScaleOffset(const uint8* src, int src_width, int src_height,
                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
                int interpolate) {
  if (!src || src_width <= 0 || src_height <= 0 ||
      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
      dst_yoffset >= dst_height) {
    return -1;
  }
  dst_yoffset = dst_yoffset & ~1;  // chroma requires offset to multiple of 2.
  {
    int src_halfwidth = (src_width + 1) >> 1;
    int src_halfheight = (src_height + 1) >> 1;
    int dst_halfwidth = (dst_width + 1) >> 1;
    int dst_halfheight = (dst_height + 1) >> 1;
    int aheight = dst_height - dst_yoffset * 2;  // actual output height
    const uint8* const src_y = src;
    const uint8* const src_u = src + src_width * src_height;
    const uint8* const src_v = src + src_width * src_height +
                               src_halfwidth * src_halfheight;
    uint8* dst_y = dst + dst_yoffset * dst_width;
    uint8* dst_u = dst + dst_width * dst_height +
                   (dst_yoffset >> 1) * dst_halfwidth;
    uint8* dst_v = dst + dst_width * dst_height +
                   dst_halfwidth * dst_halfheight +
                   (dst_yoffset >> 1) * dst_halfwidth;
    return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
                 src_width, src_height, dst_y, dst_u, dst_v, dst_width,
                 dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
  }
}
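
/**
 * Illustrative sketch (editor's note, not part of the library) of the
 * offset arithmetic above for an assumed 640x480 source letterboxed into a
 * 640x480 I420 destination with dst_yoffset = 60:
 *
 *   aheight = 480 - 2 * 60 = 360                        actual picture height
 *   dst_y   = dst + 60 * 640                            skip 60 rows of Y
 *   dst_u   = dst + 640 * 480 + 30 * 320                skip 30 rows of U
 *   dst_v   = dst + 640 * 480 + 320 * 240 + 30 * 320    skip 30 rows of V
 *
 * The 640x480 source is then scaled to 640x360 into that window. An odd
 * dst_yoffset such as 61 is first rounded down to 60 so the chroma offset
 * stays a whole number of rows.
 */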

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif