1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 
12 /****************************************************************************
13 *
14 *   Module Title :     scaleopt.cpp
15 *
16 *   Description  :     Optimized scaling functions
17 *
18 ****************************************************************************/
19 #include "pragmas.h"
20 
21 /****************************************************************************
22 *  Module Statics
23 ****************************************************************************/
24 __declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
25 
26 #include "vpx_scale/vpx_scale.h"
27 #include "vpx_mem/vpx_mem.h"
28 
29 __declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
30 __declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };
31 
32 
33 /****************************************************************************
34  *
35  *  ROUTINE       : horizontal_line_5_4_scale_mmx
36  *
37  *  INPUTS        : const unsigned char *source : Pointer to source data.
38  *                  unsigned int source_width    : Stride of source.
39  *                  unsigned char *dest         : Pointer to destination data.
40  *                  unsigned int dest_width      : Stride of destination (NOT USED).
41  *
42  *  OUTPUTS       : None.
43  *
44  *  RETURNS       : void
45  *
46  *  FUNCTION      : Copies horizontal line of pixels from source to
47  *                  destination scaling up by 4 to 5.
48  *
49  *  SPECIAL NOTES : None.
50  *
51  ****************************************************************************/
52 static
horizontal_line_5_4_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)53 void horizontal_line_5_4_scale_mmx
54 (
55   const unsigned char *source,
56   unsigned int source_width,
57   unsigned char *dest,
58   unsigned int dest_width
59 ) {
60   /*
61   unsigned i;
62   unsigned int a, b, c, d, e;
63   unsigned char *des = dest;
64   const unsigned char *src = source;
65 
66   (void) dest_width;
67 
68   for ( i=0; i<source_width; i+=5 )
69   {
70       a = src[0];
71       b = src[1];
72       c = src[2];
73       d = src[3];
74       e = src[4];
75 
76       des[0] = a;
77       des[1] = ((b*192 + c* 64 + 128)>>8);
78       des[2] = ((c*128 + d*128 + 128)>>8);
79       des[3] = ((d* 64 + e*192 + 128)>>8);
80 
81       src += 5;
82       des += 4;
83   }
84   */
85   (void) dest_width;
86 
87   __asm {
88 
89     mov         esi,        source;
90     mov         edi,        dest;
91 
92     mov         ecx,        source_width;
93     movq        mm5,        const54_1;
94 
95     pxor        mm7,        mm7;
96     movq        mm6,        const54_2;
97 
98     movq        mm4,        round_values;
99     lea         edx,        [esi+ecx];
100     horizontal_line_5_4_loop:
101 
102     movq        mm0,        QWORD PTR  [esi];
103     00 01 02 03 04 05 06 07
104     movq        mm1,        mm0;
105     00 01 02 03 04 05 06 07
106 
107     psrlq       mm0,        8;
108     01 02 03 04 05 06 07 xx
109     punpcklbw   mm1,        mm7;
110     xx 00 xx 01 xx 02 xx 03
111 
112     punpcklbw   mm0,        mm7;
113     xx 01 xx 02 xx 03 xx 04
114     pmullw      mm1,        mm5
115 
116     pmullw      mm0,        mm6
117     add         esi,        5
118 
119     add         edi,        4
120     paddw       mm1,        mm0
121 
122     paddw       mm1,        mm4
123     psrlw       mm1,        8
124 
125     cmp         esi,        edx
126     packuswb    mm1,        mm7
127 
128     movd        DWORD PTR [edi-4], mm1
129 
130     jl          horizontal_line_5_4_loop
131 
132   }
133 
134 }
135 __declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };
136 __declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };
137 __declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
138 
139 static
vertical_band_5_4_scale_mmx(unsigned char * source,unsigned int src_pitch,unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)140 void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
141 
142   __asm {
143     push        ebx
144 
145     mov         esi,    source                    // Get the source and destination pointer
146     mov         ecx,    src_pitch               // Get the pitch size
147 
148     mov         edi,    dest                    // tow lines below
149     pxor        mm7,    mm7                     // clear out mm7
150 
151     mov         edx,    dest_pitch               // Loop counter
152     mov         ebx,    dest_width
153 
154     vs_5_4_loop:
155 
156     movd        mm0,    DWORD ptr [esi]         // src[0];
157     movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
158 
159     movd        mm2,    DWORD ptr [esi+ecx*2]
160     lea         eax,    [esi+ecx*2]             //
161 
162     punpcklbw   mm1,    mm7
163     punpcklbw   mm2,    mm7
164 
165     movq        mm3,    mm2
166     pmullw      mm1,    three_fourths
167 
168     pmullw      mm2,    one_fourths
169     movd        mm4,    [eax+ecx]
170 
171     pmullw      mm3,    two_fourths
172     punpcklbw   mm4,    mm7
173 
174     movq        mm5,    mm4
175     pmullw      mm4,    two_fourths
176 
177     paddw       mm1,    mm2
178     movd        mm6,    [eax+ecx*2]
179 
180     pmullw      mm5,    one_fourths
181     paddw       mm1,    round_values;
182 
183     paddw       mm3,    mm4
184     psrlw       mm1,    8
185 
186     punpcklbw   mm6,    mm7
187     paddw       mm3,    round_values
188 
189     pmullw      mm6,    three_fourths
190     psrlw       mm3,    8
191 
192     packuswb    mm1,    mm7
193     packuswb    mm3,    mm7
194 
195     movd        DWORD PTR [edi], mm0
196     movd        DWORD PTR [edi+edx], mm1
197 
198 
199     paddw       mm5,    mm6
200     movd        DWORD PTR [edi+edx*2], mm3
201 
202     lea         eax,    [edi+edx*2]
203     paddw       mm5,    round_values
204 
205     psrlw       mm5,    8
206     add         edi,    4
207 
208     packuswb    mm5,    mm7
209     movd        DWORD PTR [eax+edx], mm5
210 
211     add         esi,    4
212     sub         ebx,    4
213 
214     jg         vs_5_4_loop
215 
216     pop         ebx
217   }
218 }
219 
220 
221 __declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
222 __declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };
223 
224 
225 static
horizontal_line_5_3_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)226 void horizontal_line_5_3_scale_mmx
227 (
228   const unsigned char *source,
229   unsigned int source_width,
230   unsigned char *dest,
231   unsigned int dest_width
232 ) {
233 
234   (void) dest_width;
235   __asm {
236 
237     mov         esi,        source;
238     mov         edi,        dest;
239 
240     mov         ecx,        source_width;
241     movq        mm5,        const53_1;
242 
243     pxor        mm7,        mm7;
244     movq        mm6,        const53_2;
245 
246     movq        mm4,        round_values;
247     lea         edx,        [esi+ecx-5];
248     horizontal_line_5_3_loop:
249 
250     movq        mm0,        QWORD PTR  [esi];
251     00 01 02 03 04 05 06 07
252     movq        mm1,        mm0;
253     00 01 02 03 04 05 06 07
254 
255     psllw       mm0,        8;
256     xx 00 xx 02 xx 04 xx 06
257     psrlw       mm1,        8;
258     01 xx 03 xx 05 xx 07 xx
259 
260     psrlw       mm0,        8;
261     00 xx 02 xx 04 xx 06 xx
262     psllq       mm1,        16;
263     xx xx 01 xx 03 xx 05 xx
264 
265     pmullw      mm0,        mm6
266 
267     pmullw      mm1,        mm5
268     add         esi,        5
269 
270     add         edi,        3
271     paddw       mm1,        mm0
272 
273     paddw       mm1,        mm4
274     psrlw       mm1,        8
275 
276     cmp         esi,        edx
277     packuswb    mm1,        mm7
278 
279     movd        DWORD PTR [edi-3], mm1
280     jl          horizontal_line_5_3_loop
281 
282 // exit condition
283     movq        mm0,        QWORD PTR  [esi];
284     00 01 02 03 04 05 06 07
285     movq        mm1,        mm0;
286     00 01 02 03 04 05 06 07
287 
288     psllw       mm0,        8;
289     xx 00 xx 02 xx 04 xx 06
290     psrlw       mm1,        8;
291     01 xx 03 xx 05 xx 07 xx
292 
293     psrlw       mm0,        8;
294     00 xx 02 xx 04 xx 06 xx
295     psllq       mm1,        16;
296     xx xx 01 xx 03 xx 05 xx
297 
298     pmullw      mm0,        mm6
299 
300     pmullw      mm1,        mm5
301     paddw       mm1,        mm0
302 
303     paddw       mm1,        mm4
304     psrlw       mm1,        8
305 
306     packuswb    mm1,        mm7
307     movd        eax,        mm1
308 
309     mov         edx,        eax
310     shr         edx,        16
311 
312     mov         WORD PTR[edi],   ax
313     mov         BYTE PTR[edi+2], dl
314 
315   }
316 
317 }
318 
319 __declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };
320 __declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
321 
322 static
vertical_band_5_3_scale_mmx(unsigned char * source,unsigned int src_pitch,unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)323 void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
324 
325   __asm {
326     push        ebx
327 
328     mov         esi,    source                    // Get the source and destination pointer
329     mov         ecx,    src_pitch               // Get the pitch size
330 
331     mov         edi,    dest                    // tow lines below
332     pxor        mm7,    mm7                     // clear out mm7
333 
334     mov         edx,    dest_pitch               // Loop counter
335     movq        mm5,    one_thirds
336 
337     movq        mm6,    two_thirds
338     mov         ebx,    dest_width;
339 
340     vs_5_3_loop:
341 
342     movd        mm0,    DWORD ptr [esi]         // src[0];
343     movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
344 
345     movd        mm2,    DWORD ptr [esi+ecx*2]
346     lea         eax,    [esi+ecx*2]             //
347 
348     punpcklbw   mm1,    mm7
349     punpcklbw   mm2,    mm7
350 
351     pmullw      mm1,    mm5
352     pmullw      mm2,    mm6
353 
354     movd        mm3,    DWORD ptr [eax+ecx]
355     movd        mm4,    DWORD ptr [eax+ecx*2]
356 
357     punpcklbw   mm3,    mm7
358     punpcklbw   mm4,    mm7
359 
360     pmullw      mm3,    mm6
361     pmullw      mm4,    mm5
362 
363 
364     movd        DWORD PTR [edi], mm0
365     paddw       mm1,    mm2
366 
367     paddw       mm1,    round_values
368     psrlw       mm1,    8
369 
370     packuswb    mm1,    mm7
371     paddw       mm3,    mm4
372 
373     paddw       mm3,    round_values
374     movd        DWORD PTR [edi+edx], mm1
375 
376     psrlw       mm3,    8
377     packuswb    mm3,    mm7
378 
379     movd        DWORD PTR [edi+edx*2], mm3
380 
381 
382     add         edi,    4
383     add         esi,    4
384 
385     sub         ebx,    4
386     jg          vs_5_3_loop
387 
388     pop         ebx
389   }
390 }
391 
392 
393 
394 
395 /****************************************************************************
396  *
397  *  ROUTINE       : horizontal_line_2_1_scale
398  *
399  *  INPUTS        : const unsigned char *source :
400  *                  unsigned int source_width    :
401  *                  unsigned char *dest         :
402  *                  unsigned int dest_width      :
403  *
404  *  OUTPUTS       : None.
405  *
406  *  RETURNS       : void
407  *
408  *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
409  *
410  *  SPECIAL NOTES : None.
411  *
412  ****************************************************************************/
413 static
horizontal_line_2_1_scale_mmx(const unsigned char * source,unsigned int source_width,unsigned char * dest,unsigned int dest_width)414 void horizontal_line_2_1_scale_mmx
415 (
416   const unsigned char *source,
417   unsigned int source_width,
418   unsigned char *dest,
419   unsigned int dest_width
420 ) {
421   (void) dest_width;
422   (void) source_width;
423   __asm {
424     mov         esi,    source
425     mov         edi,    dest
426 
427     pxor        mm7,    mm7
428     mov         ecx,    dest_width
429 
430     xor         edx,    edx
431     hs_2_1_loop:
432 
433     movq        mm0,    [esi+edx*2]
434     psllw       mm0,    8
435 
436     psrlw       mm0,    8
437     packuswb    mm0,    mm7
438 
439     movd        DWORD Ptr [edi+edx], mm0;
440     add         edx,    4
441 
442     cmp         edx,    ecx
443     jl          hs_2_1_loop
444 
445   }
446 }
447 
448 
449 
/*
 * Non-interpolating 2:1 vertical band scaler: the output row is a plain copy
 * of dest_width bytes from source to dest. Both pitch arguments are ignored.
 */
static
void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch,
                                 unsigned char *dest, unsigned int dest_pitch,
                                 unsigned int dest_width) {
  (void) src_pitch;
  (void) dest_pitch;

  vpx_memcpy(dest, source, dest_width);
}
456 
457 
458 __declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
459 __declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
460 
461 static
vertical_band_2_1_scale_i_mmx(unsigned char * source,unsigned int src_pitch,unsigned char * dest,unsigned int dest_pitch,unsigned int dest_width)462 void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
463 
464   (void) dest_pitch;
465   __asm {
466     mov         esi,        source
467     mov         edi,        dest
468 
469     mov         eax,        src_pitch
470     mov         edx,        dest_width
471 
472     pxor        mm7,        mm7
473     sub         esi,        eax             // back one line
474 
475 
476     lea         ecx,        [esi+edx];
477     movq        mm6,        round_values;
478 
479     movq        mm5,        three_sixteenths;
480     movq        mm4,        ten_sixteenths;
481 
482     vs_2_1_i_loop:
483     movd        mm0,        [esi]           //
484     movd        mm1,        [esi+eax]       //
485 
486     movd        mm2,        [esi+eax*2]     //
487     punpcklbw   mm0,        mm7
488 
489     pmullw      mm0,        mm5
490     punpcklbw   mm1,        mm7
491 
492     pmullw      mm1,        mm4
493     punpcklbw   mm2,        mm7
494 
495     pmullw      mm2,        mm5
496     paddw       mm0,        round_values
497 
498     paddw       mm1,        mm2
499     paddw       mm0,        mm1
500 
501     psrlw       mm0,        8
502     packuswb    mm0,        mm7
503 
504     movd        DWORD PTR [edi],        mm0
505     add         esi,        4
506 
507     add         edi,        4;
508     cmp         esi,        ecx
509     jl          vs_2_1_i_loop
510 
511   }
512 }
513 
514 
515 
516 void
register_mmxscalers(void)517 register_mmxscalers(void) {
518   vp8_vertical_band_5_4_scale           = vertical_band_5_4_scale_mmx;
519   vp8_vertical_band_5_3_scale           = vertical_band_5_3_scale_mmx;
520   vp8_vertical_band_2_1_scale           = vertical_band_2_1_scale_mmx;
521   vp8_vertical_band_2_1_scale_i         = vertical_band_2_1_scale_i_mmx;
522   vp8_horizontal_line_2_1_scale         = horizontal_line_2_1_scale_mmx;
523   vp8_horizontal_line_5_3_scale         = horizontal_line_5_3_scale_mmx;
524   vp8_horizontal_line_5_4_scale         = horizontal_line_5_4_scale_mmx;
525 }
526