/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */


/****************************************************************************
 *
 * Module Title : scaleopt.cpp
 *
 * Description : Optimized scaling functions
 *
 ****************************************************************************/
19 #include "pragmas.h"
20
/****************************************************************************
 * Module Statics
 ****************************************************************************/
/* Rounding bias (128 = one half of 256) for four 16-bit lanes; the scalers in
 * this file add it to each weighted sum before shifting right by 8. */
__declspec(align(16)) static const unsigned short round_values[] = { 128, 128, 128, 128 };
25
26 #include "vpx_scale/vpx_scale.h"
27 #include "vpx_mem/vpx_mem.h"
28
/*
 * Per-lane weights, in units of 1/256, for the 5-to-4 horizontal filter.
 * const54_1 is applied to source pixels 0..3 and const54_2 to source pixels
 * 1..4 of each group; the two weights of every lane sum to 256.
 */
__declspec(align(16)) static const unsigned short const54_2[] = {   0,  64, 128, 192 };
__declspec(align(16)) static const unsigned short const54_1[] = { 256, 192, 128,  64 };
31
32
33 /****************************************************************************
34 *
35 * ROUTINE : horizontal_line_5_4_scale_mmx
36 *
37 * INPUTS : const unsigned char *source : Pointer to source data.
38 * unsigned int source_width : Stride of source.
39 * unsigned char *dest : Pointer to destination data.
40 * unsigned int dest_width : Stride of destination (NOT USED).
41 *
42 * OUTPUTS : None.
43 *
44 * RETURNS : void
45 *
46 * FUNCTION : Copies horizontal line of pixels from source to
47 * destination scaling up by 4 to 5.
48 *
49 * SPECIAL NOTES : None.
50 *
51 ****************************************************************************/
52 static
horizontal_line_5_4_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)53 void horizontal_line_5_4_scale_mmx
54 (
55 const unsigned char *source,
56 unsigned int source_width,
57 unsigned char *dest,
58 unsigned int dest_width
59 ) {
60 /*
61 unsigned i;
62 unsigned int a, b, c, d, e;
63 unsigned char *des = dest;
64 const unsigned char *src = source;
65
66 (void) dest_width;
67
68 for ( i=0; i<source_width; i+=5 )
69 {
70 a = src[0];
71 b = src[1];
72 c = src[2];
73 d = src[3];
74 e = src[4];
75
76 des[0] = a;
77 des[1] = ((b*192 + c* 64 + 128)>>8);
78 des[2] = ((c*128 + d*128 + 128)>>8);
79 des[3] = ((d* 64 + e*192 + 128)>>8);
80
81 src += 5;
82 des += 4;
83 }
84 */
85 (void) dest_width;
86
87 __asm {
88
89 mov esi, source;
90 mov edi, dest;
91
92 mov ecx, source_width;
93 movq mm5, const54_1;
94
95 pxor mm7, mm7;
96 movq mm6, const54_2;
97
98 movq mm4, round_values;
99 lea edx, [esi+ecx];
100 horizontal_line_5_4_loop:
101
102 movq mm0, QWORD PTR [esi];
103 00 01 02 03 04 05 06 07
104 movq mm1, mm0;
105 00 01 02 03 04 05 06 07
106
107 psrlq mm0, 8;
108 01 02 03 04 05 06 07 xx
109 punpcklbw mm1, mm7;
110 xx 00 xx 01 xx 02 xx 03
111
112 punpcklbw mm0, mm7;
113 xx 01 xx 02 xx 03 xx 04
114 pmullw mm1, mm5
115
116 pmullw mm0, mm6
117 add esi, 5
118
119 add edi, 4
120 paddw mm1, mm0
121
122 paddw mm1, mm4
123 psrlw mm1, 8
124
125 cmp esi, edx
126 packuswb mm1, mm7
127
128 movd DWORD PTR [edi-4], mm1
129
130 jl horizontal_line_5_4_loop
131
132 }
133
134 }
/* 1/4, 2/4 and 3/4 expressed in units of 1/256, replicated across four
 * 16-bit lanes for use as MMX multiplier operands. */
__declspec(align(16)) static const unsigned short one_fourths[]   = {  64,  64,  64,  64 };
__declspec(align(16)) static const unsigned short two_fourths[]   = { 128, 128, 128, 128 };
__declspec(align(16)) static const unsigned short three_fourths[] = { 192, 192, 192, 192 };
138
139 static
vertical_band_5_4_scale_mmx(unsigned char * source,unsigned int src_pitch,unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)140 void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
141
142 __asm {
143 push ebx
144
145 mov esi, source // Get the source and destination pointer
146 mov ecx, src_pitch // Get the pitch size
147
148 mov edi, dest // tow lines below
149 pxor mm7, mm7 // clear out mm7
150
151 mov edx, dest_pitch // Loop counter
152 mov ebx, dest_width
153
154 vs_5_4_loop:
155
156 movd mm0, DWORD ptr [esi] // src[0];
157 movd mm1, DWORD ptr [esi+ecx] // src[1];
158
159 movd mm2, DWORD ptr [esi+ecx*2]
160 lea eax, [esi+ecx*2] //
161
162 punpcklbw mm1, mm7
163 punpcklbw mm2, mm7
164
165 movq mm3, mm2
166 pmullw mm1, three_fourths
167
168 pmullw mm2, one_fourths
169 movd mm4, [eax+ecx]
170
171 pmullw mm3, two_fourths
172 punpcklbw mm4, mm7
173
174 movq mm5, mm4
175 pmullw mm4, two_fourths
176
177 paddw mm1, mm2
178 movd mm6, [eax+ecx*2]
179
180 pmullw mm5, one_fourths
181 paddw mm1, round_values;
182
183 paddw mm3, mm4
184 psrlw mm1, 8
185
186 punpcklbw mm6, mm7
187 paddw mm3, round_values
188
189 pmullw mm6, three_fourths
190 psrlw mm3, 8
191
192 packuswb mm1, mm7
193 packuswb mm3, mm7
194
195 movd DWORD PTR [edi], mm0
196 movd DWORD PTR [edi+edx], mm1
197
198
199 paddw mm5, mm6
200 movd DWORD PTR [edi+edx*2], mm3
201
202 lea eax, [edi+edx*2]
203 paddw mm5, round_values
204
205 psrlw mm5, 8
206 add edi, 4
207
208 packuswb mm5, mm7
209 movd DWORD PTR [eax+edx], mm5
210
211 add esi, 4
212 sub ebx, 4
213
214 jg vs_5_4_loop
215
216 pop ebx
217 }
218 }
219
220
/*
 * Per-lane weights, in units of 1/256, for the 5-to-3 horizontal filter.
 * const53_2 is applied to the even-indexed source pixels (0, 2, 4) and
 * const53_1 to the odd-indexed ones (1, 3) after they are shifted up one
 * lane; the fourth lane is unused and weighted 0.
 */
__declspec(align(16)) static const unsigned short const53_1[] = {   0,  85, 171,   0 };
__declspec(align(16)) static const unsigned short const53_2[] = { 256, 171,  85,   0 };
223
224
225 static
horizontal_line_5_3_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)226 void horizontal_line_5_3_scale_mmx
227 (
228 const unsigned char *source,
229 unsigned int source_width,
230 unsigned char *dest,
231 unsigned int dest_width
232 ) {
233
234 (void) dest_width;
235 __asm {
236
237 mov esi, source;
238 mov edi, dest;
239
240 mov ecx, source_width;
241 movq mm5, const53_1;
242
243 pxor mm7, mm7;
244 movq mm6, const53_2;
245
246 movq mm4, round_values;
247 lea edx, [esi+ecx-5];
248 horizontal_line_5_3_loop:
249
250 movq mm0, QWORD PTR [esi];
251 00 01 02 03 04 05 06 07
252 movq mm1, mm0;
253 00 01 02 03 04 05 06 07
254
255 psllw mm0, 8;
256 xx 00 xx 02 xx 04 xx 06
257 psrlw mm1, 8;
258 01 xx 03 xx 05 xx 07 xx
259
260 psrlw mm0, 8;
261 00 xx 02 xx 04 xx 06 xx
262 psllq mm1, 16;
263 xx xx 01 xx 03 xx 05 xx
264
265 pmullw mm0, mm6
266
267 pmullw mm1, mm5
268 add esi, 5
269
270 add edi, 3
271 paddw mm1, mm0
272
273 paddw mm1, mm4
274 psrlw mm1, 8
275
276 cmp esi, edx
277 packuswb mm1, mm7
278
279 movd DWORD PTR [edi-3], mm1
280 jl horizontal_line_5_3_loop
281
282 // exit condition
283 movq mm0, QWORD PTR [esi];
284 00 01 02 03 04 05 06 07
285 movq mm1, mm0;
286 00 01 02 03 04 05 06 07
287
288 psllw mm0, 8;
289 xx 00 xx 02 xx 04 xx 06
290 psrlw mm1, 8;
291 01 xx 03 xx 05 xx 07 xx
292
293 psrlw mm0, 8;
294 00 xx 02 xx 04 xx 06 xx
295 psllq mm1, 16;
296 xx xx 01 xx 03 xx 05 xx
297
298 pmullw mm0, mm6
299
300 pmullw mm1, mm5
301 paddw mm1, mm0
302
303 paddw mm1, mm4
304 psrlw mm1, 8
305
306 packuswb mm1, mm7
307 movd eax, mm1
308
309 mov edx, eax
310 shr edx, 16
311
312 mov WORD PTR[edi], ax
313 mov BYTE PTR[edi+2], dl
314
315 }
316
317 }
318
/* Approximately 1/3 and 2/3 expressed in units of 1/256 (85 + 171 = 256),
 * replicated across four 16-bit lanes for use as MMX multiplier operands. */
__declspec(align(16)) static const unsigned short one_thirds[] = {  85,  85,  85,  85 };
__declspec(align(16)) static const unsigned short two_thirds[] = { 171, 171, 171, 171 };
321
322 static
vertical_band_5_3_scale_mmx(unsigned char * source,unsigned int src_pitch,unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)323 void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
324
325 __asm {
326 push ebx
327
328 mov esi, source // Get the source and destination pointer
329 mov ecx, src_pitch // Get the pitch size
330
331 mov edi, dest // tow lines below
332 pxor mm7, mm7 // clear out mm7
333
334 mov edx, dest_pitch // Loop counter
335 movq mm5, one_thirds
336
337 movq mm6, two_thirds
338 mov ebx, dest_width;
339
340 vs_5_3_loop:
341
342 movd mm0, DWORD ptr [esi] // src[0];
343 movd mm1, DWORD ptr [esi+ecx] // src[1];
344
345 movd mm2, DWORD ptr [esi+ecx*2]
346 lea eax, [esi+ecx*2] //
347
348 punpcklbw mm1, mm7
349 punpcklbw mm2, mm7
350
351 pmullw mm1, mm5
352 pmullw mm2, mm6
353
354 movd mm3, DWORD ptr [eax+ecx]
355 movd mm4, DWORD ptr [eax+ecx*2]
356
357 punpcklbw mm3, mm7
358 punpcklbw mm4, mm7
359
360 pmullw mm3, mm6
361 pmullw mm4, mm5
362
363
364 movd DWORD PTR [edi], mm0
365 paddw mm1, mm2
366
367 paddw mm1, round_values
368 psrlw mm1, 8
369
370 packuswb mm1, mm7
371 paddw mm3, mm4
372
373 paddw mm3, round_values
374 movd DWORD PTR [edi+edx], mm1
375
376 psrlw mm3, 8
377 packuswb mm3, mm7
378
379 movd DWORD PTR [edi+edx*2], mm3
380
381
382 add edi, 4
383 add esi, 4
384
385 sub ebx, 4
386 jg vs_5_3_loop
387
388 pop ebx
389 }
390 }
391
392
393
394
395 /****************************************************************************
396 *
397 * ROUTINE : horizontal_line_2_1_scale
398 *
399 * INPUTS : const unsigned char *source :
400 * unsigned int source_width :
401 * unsigned char *dest :
402 * unsigned int dest_width :
403 *
404 * OUTPUTS : None.
405 *
406 * RETURNS : void
407 *
408 * FUNCTION : 1 to 2 up-scaling of a horizontal line of pixels.
409 *
410 * SPECIAL NOTES : None.
411 *
412 ****************************************************************************/
413 static
horizontal_line_2_1_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)414 void horizontal_line_2_1_scale_mmx
415 (
416 const unsigned char *source,
417 unsigned int source_width,
418 unsigned char *dest,
419 unsigned int dest_width
420 ) {
421 (void) dest_width;
422 (void) source_width;
423 __asm {
424 mov esi, source
425 mov edi, dest
426
427 pxor mm7, mm7
428 mov ecx, dest_width
429
430 xor edx, edx
431 hs_2_1_loop:
432
433 movq mm0, [esi+edx*2]
434 psllw mm0, 8
435
436 psrlw mm0, 8
437 packuswb mm0, mm7
438
439 movd DWORD Ptr [edi+edx], mm0;
440 add edx, 4
441
442 cmp edx, ecx
443 jl hs_2_1_loop
444
445 }
446 }
447
448
449
/*
 * vertical_band_2_1_scale_mmx
 *
 * Copies dest_width bytes from source to dest with vpx_memcpy; no filtering
 * is applied.  src_pitch and dest_pitch are not used.
 */
static
void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
    (void) src_pitch;
    (void) dest_pitch;

    vpx_memcpy(dest, source, dest_width);
}
456
457
/* 3/16 and 10/16 expressed in units of 1/256 (48 + 160 + 48 = 256),
 * replicated across four 16-bit lanes for use as MMX multiplier operands. */
__declspec(align(16)) static const unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
__declspec(align(16)) static const unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
460
461 static
vertical_band_2_1_scale_i_mmx(unsigned char * source,unsigned int src_pitch,unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)462 void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
463
464 (void) dest_pitch;
465 __asm {
466 mov esi, source
467 mov edi, dest
468
469 mov eax, src_pitch
470 mov edx, dest_width
471
472 pxor mm7, mm7
473 sub esi, eax // back one line
474
475
476 lea ecx, [esi+edx];
477 movq mm6, round_values;
478
479 movq mm5, three_sixteenths;
480 movq mm4, ten_sixteenths;
481
482 vs_2_1_i_loop:
483 movd mm0, [esi] //
484 movd mm1, [esi+eax] //
485
486 movd mm2, [esi+eax*2] //
487 punpcklbw mm0, mm7
488
489 pmullw mm0, mm5
490 punpcklbw mm1, mm7
491
492 pmullw mm1, mm4
493 punpcklbw mm2, mm7
494
495 pmullw mm2, mm5
496 paddw mm0, round_values
497
498 paddw mm1, mm2
499 paddw mm0, mm1
500
501 psrlw mm0, 8
502 packuswb mm0, mm7
503
504 movd DWORD PTR [edi], mm0
505 add esi, 4
506
507 add edi, 4;
508 cmp esi, ecx
509 jl vs_2_1_i_loop
510
511 }
512 }
513
514
515
516 void
register_mmxscalers(void)517 register_mmxscalers(void) {
518 vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
519 vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
520 vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
521 vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
522 vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
523 vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
524 vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
525 }
526