1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /**
22  *******************************************************************************
23  * @file
24  *  impeg2_inter_pred_sse42_intr.c
25  *
26  * @brief
27  *  Contains Motion compensation function definitions for MPEG2 decoder
28  *
29  * @author
30  *  Mohit [100664]
31  *
32  * - impeg2_copy_mb_sse42()
33  * - impeg2_interpolate_sse42()
34  * - impeg2_mc_halfx_halfy_8x8_sse42()
35  * - impeg2_mc_halfx_fully_8x8_sse42()
36  * - impeg2_mc_fullx_halfy_8x8_sse42()
37  * - impeg2_mc_fullx_fully_8x8_sse42()
38  *
39  * @remarks
40  *  None
41  *
42  *******************************************************************************
43  */
44 #include <stdio.h>
45 #include <string.h>
46 #include "iv_datatypedef.h"
47 #include "impeg2_macros.h"
48 #include "impeg2_defs.h"
49 #include "impeg2_inter_pred.h"
50 
51 #include <immintrin.h>
52 #include <emmintrin.h>
53 #include <smmintrin.h>
54 #include <tmmintrin.h>
55 
56 /*******************************************************************************
57 *  Function Name   : impeg2_copy_mb_sse42
58 *
59 *  Description     : copies 3 components (16x16 Y, 8x8 U, 8x8 V) to the frame from mc_buf
60 *
61 *  Arguments       :
62 *  src_buf         : Source Buffer
63 *  dst_buf         : Destination Buffer
64 *  src_wd          : Source luma row stride (chroma rows use src_wd >> 1)
65 *  dst_wd          : Destination luma row stride (chroma rows use dst_wd >> 1)
66 *
67 *  Values Returned : None
68 *******************************************************************************/
void impeg2_copy_mb_sse42(yuv_buf_t *src_buf,
                          yuv_buf_t *dst_buf,
                          UWORD32 src_wd,
                          UWORD32 dst_wd)
{
    UWORD8 *pu1_from;
    UWORD8 *pu1_to;
    __m128i rows[4];
    UWORD32 plane, group, r;

    /* Luma plane: 16 rows of 16 bytes, moved four rows at a time. Within a */
    /* group, all four rows are loaded before the first one is stored.      */
    pu1_from = src_buf->pu1_y;
    pu1_to = dst_buf->pu1_y;
    for(group = 0; group < 4; group++)
    {
        /* Step to the next group of four rows (not needed for the first) */
        if(group != 0)
        {
            pu1_from += 4 * src_wd;
            pu1_to += 4 * dst_wd;
        }

        for(r = 0; r < 4; r++)
        {
            rows[r] = _mm_loadu_si128((__m128i *)(pu1_from + r * src_wd));
        }
        for(r = 0; r < 4; r++)
        {
            _mm_storeu_si128((__m128i *)(pu1_to + r * dst_wd), rows[r]);
        }
    }

    /* Chroma rows are addressed with half of the luma stride */
    src_wd >>= 1;
    dst_wd >>= 1;

    /* Chroma planes, U first and then V: 8 rows of 8 bytes each, again */
    /* moved four rows at a time.                                       */
    for(plane = 0; plane < 2; plane++)
    {
        if(plane == 0)
        {
            pu1_from = src_buf->pu1_u;
            pu1_to = dst_buf->pu1_u;
        }
        else
        {
            pu1_from = src_buf->pu1_v;
            pu1_to = dst_buf->pu1_v;
        }

        for(group = 0; group < 2; group++)
        {
            if(group != 0)
            {
                pu1_from += 4 * src_wd;
                pu1_to += 4 * dst_wd;
            }

            for(r = 0; r < 4; r++)
            {
                rows[r] = _mm_loadl_epi64((__m128i *)(pu1_from + r * src_wd));
            }
            for(r = 0; r < 4; r++)
            {
                _mm_storel_epi64((__m128i *)(pu1_to + r * dst_wd), rows[r]);
            }
        }
    }
}
197 
198 /*****************************************************************************/
199 /*                                                                           */
200 /*  Function Name : impeg2_interpolate_sse42                                 */
201 /*                                                                           */
202 /*  Description   : averages the contents of buf_src1 and buf_src2 and stores*/
203 /*                  result in buf_dst                                        */
204 /*                                                                           */
205 /*  Inputs        : buf_src1 -  First Source                                 */
206 /*                  buf_src2 -  Second Source                                */
207 /*                                                                           */
208 /*  Globals       : None                                                     */
209 /*                                                                           */
210 /*  Processing    : Avg the values from two sources and store the result in  */
211 /*                  destination buffer                                       */
212 /*                                                                           */
213 /*  Outputs       : buf_dst  -  Avg of contents of buf_src1 and buf_src2     */
214 /*                                                                           */
215 /*  Returns       : None                                                     */
216 /*                                                                           */
217 /*  Issues        : Assumes that all 3 buffers are of same size              */
218 /*                                                                           */
219 /*****************************************************************************/
void impeg2_interpolate_sse42(yuv_buf_t *buf_src1,
                              yuv_buf_t *buf_src2,
                              yuv_buf_t *buf_dst,
                              UWORD32 stride)
{
    UWORD8 *pu1_a;
    UWORD8 *pu1_b;
    UWORD8 *pu1_out;
    __m128i avg[4];
    UWORD32 src_pitch;
    UWORD32 plane, group, r;

    /* Luma plane: 16 rows of 16 bytes. Both sources are read with a fixed */
    /* row pitch of 16; only the destination uses the caller's stride.     */
    /* _mm_avg_epu8 yields the rounded-up byte average (a + b + 1) >> 1.   */
    src_pitch = 16;
    pu1_a = buf_src1->pu1_y;
    pu1_b = buf_src2->pu1_y;
    pu1_out = buf_dst->pu1_y;
    for(group = 0; group < 4; group++)
    {
        /* Step to the next group of four rows (not needed for the first) */
        if(group != 0)
        {
            pu1_a += 4 * src_pitch;
            pu1_b += 4 * src_pitch;
            pu1_out += 4 * stride;
        }

        /* All four rows are read and averaged before any row is written */
        for(r = 0; r < 4; r++)
        {
            __m128i row_a = _mm_loadu_si128((__m128i *)(pu1_a + r * src_pitch));
            __m128i row_b = _mm_loadu_si128((__m128i *)(pu1_b + r * src_pitch));

            avg[r] = _mm_avg_epu8(row_a, row_b);
        }
        for(r = 0; r < 4; r++)
        {
            _mm_storeu_si128((__m128i *)(pu1_out + r * stride), avg[r]);
        }
    }

    /* Chroma: destination stride is halved, source row pitch is fixed at 8 */
    stride >>= 1;
    src_pitch = 8;

    /* Chroma planes, U first and then V: 8 rows of 8 bytes each */
    for(plane = 0; plane < 2; plane++)
    {
        if(plane == 0)
        {
            pu1_a = buf_src1->pu1_u;
            pu1_b = buf_src2->pu1_u;
            pu1_out = buf_dst->pu1_u;
        }
        else
        {
            pu1_a = buf_src1->pu1_v;
            pu1_b = buf_src2->pu1_v;
            pu1_out = buf_dst->pu1_v;
        }

        for(group = 0; group < 2; group++)
        {
            if(group != 0)
            {
                pu1_a += 4 * src_pitch;
                pu1_b += 4 * src_pitch;
                pu1_out += 4 * stride;
            }

            for(r = 0; r < 4; r++)
            {
                __m128i row_a = _mm_loadl_epi64((__m128i *)(pu1_a + r * src_pitch));
                __m128i row_b = _mm_loadl_epi64((__m128i *)(pu1_b + r * src_pitch));

                avg[r] = _mm_avg_epu8(row_a, row_b);
            }
            for(r = 0; r < 4; r++)
            {
                _mm_storel_epi64((__m128i *)(pu1_out + r * stride), avg[r]);
            }
        }
    }
}
436 
437 /*****************************************************************************/
438 /*                                                                           */
439 /*  Function Name : impeg2_mc_halfx_halfy_8x8_sse42()                                 */
440 /*                                                                           */
441 /*  Description   : Gets the buffer from (0.5,0.5) to (8.5,8.5)              */
442 /*                  and the above block of size 8 x 8 will be placed as a    */
443 /*                  block from the current position of out_buf               */
444 /*                                                                           */
445 /*  Inputs        : ref - Reference frame from which the block will be       */
446 /*                        extracted.                                         */
447 /*                  ref_wid - Width (row stride) of reference frame          */
448 /*                  out_wid - Width (row stride) of the output frame         */
449 /*                  Block width and height are not parameters; both are      */
450 /*                  fixed at 8 by this implementation.                       */
451 /*                                                                           */
452 /*  Globals       : None                                                     */
453 /*                                                                           */
454 /*  Processing    : Point to the (0,0),(1,0),(0,1),(1,1) position in         */
455 /*                  the ref frame.Interpolate these four values to get the   */
456 /*                  value at(0.5,0.5).Repeat this to get an 8 x 8 block      */
457 /*                  using 9 x 9 block from reference frame                   */
458 /*                                                                           */
459 /*  Outputs       : out -  Output containing the extracted block             */
460 /*                                                                           */
461 /*  Returns       : None                                                     */
462 /*                                                                           */
463 /*  Issues        : None                                                     */
464 /*                                                                           */
465 /*****************************************************************************/
void impeg2_mc_halfx_halfy_8x8_sse42(UWORD8 *out,
                                     UWORD8 *ref,
                                     UWORD32 ref_wid,
                                     UWORD32 out_wid)
{
    /* P0-P3 are reference pixels and Q is the interpolated output pixel: */
    /*
       P0 P1
         Q
       P2 P3

       Q = (P0 + P1 + P2 + P3 + 2) >> 2
    */
    const __m128i rounding = _mm_set1_epi16(2);
    __m128i left, right;
    __m128i hsum_upper, hsum_lower, quad_sum;
    UWORD8 *pu1_row = ref;
    UWORD32 row;

    /* Horizontal pair sums (P0 + P1) of reference row 0, widened to 16 bit */
    left = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)pu1_row));
    right = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)(pu1_row + 1)));
    hsum_upper = _mm_add_epi16(left, right);

    /* Output row N combines the pair sums of reference rows N and N + 1,  */
    /* so each of the nine reference rows is read once and its pair sum is */
    /* carried over to the next iteration.                                 */
    for(row = 0; row < 8; row++)
    {
        if(row != 0)
        {
            out += out_wid;
        }

        /* Pair sums (P2 + P3) of the next reference row */
        pu1_row += ref_wid;
        left = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)pu1_row));
        right = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *)(pu1_row + 1)));
        hsum_lower = _mm_add_epi16(left, right);

        /* Four-pixel sum, rounded, scaled down and narrowed back to bytes */
        quad_sum = _mm_add_epi16(hsum_upper, hsum_lower);
        quad_sum = _mm_add_epi16(quad_sum, rounding);
        quad_sum = _mm_srli_epi16(quad_sum, 2);
        /* Only the low 8 bytes are stored; the upper pack lane is unused */
        quad_sum = _mm_packus_epi16(quad_sum, quad_sum);

        _mm_storel_epi64((__m128i *)out, quad_sum);

        hsum_upper = hsum_lower;
    }
}
648 
649 /*****************************************************************************/
650 /*                                                                           */
651 /*  Function Name : impeg2_mc_halfx_fully_8x8_sse42()                                 */
652 /*                                                                           */
653 /*  Description   : Gets the buffer from (0.5,0) to (8.5,8)                  */
654 /*                  and the above block of size 8 x 8 will be placed as a    */
655 /*                  block from the current position of out_buf               */
656 /*                                                                           */
657 /*  Inputs        : ref - Reference frame from which the block will be       */
658 /*                        extracted.                                         */
659 /*                  ref_wid - Width (row stride) of reference frame          */
660 /*                  out_wid - Width (row stride) of the output frame         */
661 /*                  Block width and height are not parameters; both are      */
662 /*                  fixed at 8 by this implementation.                       */
663 /*                                                                           */
664 /*  Globals       : None                                                     */
665 /*                                                                           */
666 /*  Processing    : Point to the (0,0) and (1,0) position in the ref frame   */
667 /*                  Interpolate these two values to get the value at(0.5,0)  */
668 /*                  Repeat this to get an 8 x 8 block using 9 x 8 block from */
669 /*                  reference frame                                          */
670 /*                                                                           */
671 /*  Outputs       : out -  Output containing the extracted block             */
672 /*                                                                           */
673 /*  Returns       : None                                                     */
674 /*                                                                           */
675 /*  Issues        : None                                                     */
676 /*                                                                           */
677 /*****************************************************************************/
void impeg2_mc_halfx_fully_8x8_sse42(UWORD8 *out,
                                     UWORD8 *ref,
                                     UWORD32 ref_wid,
                                     UWORD32 out_wid)
{
    /* P0 and P1 are horizontally adjacent reference pixels and Q is the */
    /* value being estimated: Q = (P0 + P1 + 1) >> 1                      */
    /*
       P0 Q P1
    */
    UWORD8 *pu1_left = ref;
    __m128i upper, lower;
    UWORD32 pair;

    /* Eight output rows, produced two at a time */
    for(pair = 0; pair < 4; pair++)
    {
        /* Step to the next pair of rows (not needed for the first) */
        if(pair != 0)
        {
            pu1_left += 2 * ref_wid;
            out += 2 * out_wid;
        }

        /* Both rows are read and averaged before either one is written */
        upper = _mm_avg_epu8(_mm_loadl_epi64((__m128i *)pu1_left),
                             _mm_loadl_epi64((__m128i *)(pu1_left + 1)));
        lower = _mm_avg_epu8(_mm_loadl_epi64((__m128i *)(pu1_left + ref_wid)),
                             _mm_loadl_epi64((__m128i *)(pu1_left + 1 + ref_wid)));

        _mm_storel_epi64((__m128i *)out, upper);
        _mm_storel_epi64((__m128i *)(out + out_wid), lower);
    }
}
756 
757 
758 /*****************************************************************************/
759 /*                                                                           */
760 /*  Function Name : impeg2_mc_fullx_halfy_8x8_sse42()                                 */
761 /*                                                                           */
762 /*  Description   : Gets the buffer from (0,0.5) to (8,8.5)                  */
763 /*                  and the above block of size 8 x 8 will be placed as a    */
764 /*                  block from the current position of out_buf               */
765 /*                                                                           */
/*  Inputs        : ref - Reference frame from which the block will be       */
/*                        extracted.                                         */
/*                  ref_wid - Width of reference frame                       */
/*                  out_wid - Width of the output frame                      */
/*                  Block width and height are fixed at 8 and are not        */
/*                  passed as arguments.                                     */
772 /*                                                                           */
773 /*  Globals       : None                                                     */
774 /*                                                                           */
775 /*  Processing    : Point to the (0,0) and (0,1)   position in the ref frame */
776 /*                  Interpolate these two values to get the value at(0,0.5)  */
777 /*                  Repeat this to get an 8 x 8 block using 8 x 9 block from */
778 /*                  reference frame                                          */
779 /*                                                                           */
780 /*  Outputs       : out -  Output containing the extracted block             */
781 /*                                                                           */
782 /*  Returns       : None                                                     */
783 /*                                                                           */
784 /*  Issues        : None                                                     */
785 /*                                                                           */
786 /*****************************************************************************/
void impeg2_mc_fullx_halfy_8x8_sse42(UWORD8 *out,
                            UWORD8 *ref,
                            UWORD32 ref_wid,
                            UWORD32 out_wid)
{
    /* x is the value being estimated. It is produced by _mm_avg_epu8 from  */
    /* the reference pixel P0 and the pixel P1 directly below it.           */
    /*
       P0
        x
       P1
    */
    __m128i row_above, row_mid, row_below;
    UWORD32 pairs_left = 4;     /* 8 output rows, written two at a time */

    /* Reference row 0; every later row is fetched inside the loop */
    row_above = _mm_loadl_epi64((__m128i *)ref);

    for(;;)
    {
        /* Two new reference rows are enough for two output rows, because */
        /* the row shared with the previous pair is carried in row_above. */
        row_mid   = _mm_loadl_epi64((__m128i *)(ref + ref_wid));
        row_below = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid));

        _mm_storel_epi64((__m128i *)out,
                         _mm_avg_epu8(row_above, row_mid));
        _mm_storel_epi64((__m128i *)(out + out_wid),
                         _mm_avg_epu8(row_mid, row_below));

        pairs_left--;
        if(0 == pairs_left)
        {
            break;
        }

        /* The bottom row of this pair is the top row of the next one */
        row_above = row_below;
        ref += 2 * ref_wid;
        out += 2 * out_wid;
    }
}
840 
841 /*****************************************************************************/
842 /*                                                                           */
843 /*  Function Name : impeg2_mc_fullx_fully_8x8_sse42()                                 */
844 /*                                                                           */
845 /*  Description   : Gets the buffer from (x,y) to (x+8,y+8)                  */
846 /*                  and the above block of size 8 x 8 will be placed as a    */
847 /*                  block from the current position of out_buf               */
848 /*                                                                           */
/*  Inputs        : ref - Reference frame from which the block will be       */
/*                        extracted.                                         */
/*                  ref_wid - Width of reference frame                       */
/*                  out_wid - Width of the output frame                      */
/*                  Block width and height are fixed at 8 and are not        */
/*                  passed as arguments.                                     */
855 /*                                                                           */
856 /*  Globals       : None                                                     */
857 /*                                                                           */
858 /*  Processing    : Point to the (0,0) position in the ref frame             */
859 /*                  Get an 8 x 8 block from reference frame                  */
860 /*                                                                           */
861 /*  Outputs       : out -  Output containing the extracted block             */
862 /*                                                                           */
863 /*  Returns       : None                                                     */
864 /*                                                                           */
865 /*  Issues        : None                                                     */
866 /*                                                                           */
867 /*****************************************************************************/
void impeg2_mc_fullx_fully_8x8_sse42(UWORD8 *out,
                            UWORD8 *ref,
                            UWORD32 ref_wid,
                            UWORD32 out_wid)
{
    /* Plain copy of an 8x8 block: 8 bytes per row, no interpolation.       */
    /* The rows are moved in two groups of four (rows 0-3, then rows 4-7).  */
    UWORD32 rows_left = 8;

    for(;;)
    {
        __m128i line0, line1, line2, line3;

        /* Read the four rows of this group ... */
        line0 = _mm_loadl_epi64((__m128i *)ref);
        line1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid));
        line2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid));
        line3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid));

        /* ... and only then write them out */
        _mm_storel_epi64((__m128i *)out, line0);
        _mm_storel_epi64((__m128i *)(out + out_wid), line1);
        _mm_storel_epi64((__m128i *)(out + 2 * out_wid), line2);
        _mm_storel_epi64((__m128i *)(out + 3 * out_wid), line3);

        rows_left -= 4;
        if(0 == rows_left)
        {
            break;
        }

        /* Move on to the next group of four rows */
        ref += 4 * ref_wid;
        out += 4 * out_wid;
    }
}
900