1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /*****************************************************************************/
21 /*                                                                           */
22 /*  File Name         : ih264_weighted_pred_intr_sse42.c                     */
23 /*                                                                           */
24 /*  Description       : Contains function definitions for weighted           */
25 /*                      prediction functions in x86 sse4 intrinsics          */
26 /*                                                                           */
27 /*  List of Functions : ih264_default_weighted_pred_luma_sse42()             */
28 /*                      ih264_default_weighted_pred_chroma_sse42()           */
29 /*                      ih264_weighted_pred_luma_sse42()                     */
30 /*                      ih264_weighted_pred_chroma_sse42()                   */
31 /*                      ih264_weighted_bipred_luma_sse42()                   */
32 /*                      ih264_weighted_bipred_chroma_sse42()                 */
33 /*                                                                           */
34 /*  Issues / Problems : None                                                 */
35 /*                                                                           */
36 /*  Revision History  :                                                      */
37 /*                                                                           */
38 /*         DD MM YYYY   Author(s)       Changes                              */
39 /*         30 01 2015   Kaushik         Initial version                      */
40 /*                      Senthoor                                             */
41 /*                                                                           */
42 /*****************************************************************************/
43 /*****************************************************************************/
44 /* File Includes                                                             */
45 /*****************************************************************************/
46 
#include <immintrin.h>
#include <string.h>

#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_weighted_pred.h"
52 
53 /*****************************************************************************/
54 /*  Function definitions .                                                   */
55 /*****************************************************************************/
56 /*****************************************************************************/
57 /*                                                                           */
58 /*  Function Name : ih264_default_weighted_pred_luma_sse42                   */
59 /*                                                                           */
60 /*  Description   : This function performs the default weighted prediction   */
61 /*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
62 /*                  sample prediction process" for luma. The function gets   */
63 /*                  two ht x wd blocks, calculates their rounded-average and */
64 /*                  stores it in the destination block. (ht,wd) can be       */
65 /*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
66 /*                                                                           */
67 /*  Inputs        : pu1_src1  - Pointer to source 1                          */
68 /*                  pu1_src2  - Pointer to source 2                          */
69 /*                  pu1_dst   - Pointer to destination                       */
70 /*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
72 /*                  dst_strd  - stride for destination                       */
73 /*                  ht        - height of the block                          */
74 /*                  wd        - width of the block                           */
75 /*                                                                           */
76 /*  Issues        : None                                                     */
77 /*                                                                           */
78 /*  Revision History:                                                        */
79 /*                                                                           */
80 /*         DD MM YYYY   Author(s)       Changes                              */
81 /*         04 02 2015   Kaushik         Initial Version                      */
82 /*                      Senthoor                                             */
83 /*                                                                           */
84 /*****************************************************************************/
void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1,
                                            UWORD8 *pu1_src2,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 ht,
                                            WORD32 wd)
{
    __m128i y0_0_16x8b, y0_1_16x8b, y0_2_16x8b, y0_3_16x8b;
    __m128i y1_0_16x8b, y1_1_16x8b, y1_2_16x8b, y1_3_16x8b;

    if(wd == 4)
    {
        WORD32 dst_val0, dst_val1, dst_val2, dst_val3;

        /* Process 4 rows per iteration; only the low 4 bytes of each load
           are meaningful for a 4-wide block */
        do
        {
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y0_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            /* rounded average: (a + b + 1) >> 1 per byte lane */
            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);

            dst_val0 = _mm_cvtsi128_si32(y0_0_16x8b);
            dst_val1 = _mm_cvtsi128_si32(y0_1_16x8b);
            dst_val2 = _mm_cvtsi128_si32(y0_2_16x8b);
            dst_val3 = _mm_cvtsi128_si32(y0_3_16x8b);

            /* memcpy instead of *(WORD32 *)pu1_dst = ...: the destination is
               a UWORD8 buffer with no 4-byte alignment guarantee, and a
               type-punned WORD32 store violates strict aliasing (UB).
               Compilers emit the same single 32-bit store for this memcpy. */
            memcpy(pu1_dst, &dst_val0, sizeof(WORD32));
            memcpy(pu1_dst + dst_strd, &dst_val1, sizeof(WORD32));
            memcpy(pu1_dst + (dst_strd << 1), &dst_val2, sizeof(WORD32));
            memcpy(pu1_dst + dst_strd * 3, &dst_val3, sizeof(WORD32));

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        /* 4 rows per iteration, 8 bytes per row */
        do
        {
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y0_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);

            _mm_storel_epi64((__m128i *)pu1_dst, y0_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y0_4_16x8b, y0_5_16x8b, y0_6_16x8b, y0_7_16x8b;
        __m128i y1_4_16x8b, y1_5_16x8b, y1_6_16x8b, y1_7_16x8b;

        /* 8 rows per iteration, full 16-byte rows */
        do
        {
            y0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y0_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y0_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 3));
            y0_4_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + (src_strd1 << 2)));
            y0_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 5));
            y0_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 6));
            y0_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 7));

            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
            y1_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y1_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 3));
            y1_4_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + (src_strd2 << 2)));
            y1_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 5));
            y1_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 6));
            y1_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 7));

            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
            y0_4_16x8b = _mm_avg_epu8(y0_4_16x8b, y1_4_16x8b);
            y0_5_16x8b = _mm_avg_epu8(y0_5_16x8b, y1_5_16x8b);
            y0_6_16x8b = _mm_avg_epu8(y0_6_16x8b, y1_6_16x8b);
            y0_7_16x8b = _mm_avg_epu8(y0_7_16x8b, y1_7_16x8b);

            _mm_storeu_si128((__m128i *)pu1_dst, y0_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 2)), y0_4_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 5), y0_5_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 6), y0_6_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 7), y0_7_16x8b);

            ht -= 8;
            pu1_src1 += src_strd1 << 3;
            pu1_src2 += src_strd2 << 3;
            pu1_dst += dst_strd << 3;
        }
        while(ht > 0);
    }
}
218 
219 /*****************************************************************************/
220 /*                                                                           */
221 /*  Function Name : ih264_default_weighted_pred_chroma_sse42                 */
222 /*                                                                           */
223 /*  Description   : This function performs the default weighted prediction   */
224 /*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
225 /*                  sample prediction process" for chroma. The function gets */
226 /*                  two ht x wd blocks, calculates their rounded-average and */
227 /*                  stores it in the destination block. (ht,wd) can be       */
228 /*                  (2,2), (4,2) , (2,4), (4,4), (8,4), (4,8) or (8,8).      */
229 /*                                                                           */
230 /*  Inputs        : pu1_src1  - Pointer to source 1                          */
231 /*                  pu1_src2  - Pointer to source 2                          */
232 /*                  pu1_dst   - Pointer to destination                       */
233 /*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
235 /*                  dst_strd  - stride for destination                       */
236 /*                  ht        - height of the block                          */
237 /*                  wd        - width of the block                           */
238 /*                                                                           */
239 /*  Issues        : None                                                     */
240 /*                                                                           */
241 /*  Revision History:                                                        */
242 /*                                                                           */
243 /*         DD MM YYYY   Author(s)       Changes                              */
244 /*         04 02 2015   Kaushik         Initial Version                      */
245 /*                      Senthoor                                             */
246 /*                                                                           */
247 /*****************************************************************************/
void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
                                              UWORD8 *pu1_src2,
                                              UWORD8 *pu1_dst,
                                              WORD32 src_strd1,
                                              WORD32 src_strd2,
                                              WORD32 dst_strd,
                                              WORD32 ht,
                                              WORD32 wd)
{
    __m128i uv0_0_16x8b, uv0_1_16x8b;
    __m128i uv1_0_16x8b, uv1_1_16x8b;

    if(wd == 2)
    {
        WORD32 dst_val0, dst_val1;

        /* wd is in chroma pairs, so each row is wd * 2 = 4 interleaved
           U/V bytes; process 2 rows per iteration */
        do
        {
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            /* rounded average: (a + b + 1) >> 1 per byte lane */
            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);

            dst_val0 = _mm_cvtsi128_si32(uv0_0_16x8b);
            dst_val1 = _mm_cvtsi128_si32(uv0_1_16x8b);

            /* memcpy instead of *(WORD32 *)pu1_dst = ...: the destination is
               a UWORD8 buffer with no 4-byte alignment guarantee, and a
               type-punned WORD32 store violates strict aliasing (UB) */
            memcpy(pu1_dst, &dst_val0, sizeof(WORD32));
            memcpy(pu1_dst + dst_strd, &dst_val1, sizeof(WORD32));

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        /* 8 interleaved U/V bytes per row, 2 rows per iteration */
        do
        {
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);

            _mm_storel_epi64((__m128i *)pu1_dst, uv0_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        __m128i uv0_2_16x8b, uv0_3_16x8b;
        __m128i uv1_2_16x8b, uv1_3_16x8b;

        /* full 16-byte rows, 4 rows per iteration */
        do
        {
            uv0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            uv0_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            uv0_3_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + src_strd1 * 3));

            uv1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
            uv1_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            uv1_3_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + src_strd2 * 3));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
            uv0_2_16x8b = _mm_avg_epu8(uv0_2_16x8b, uv1_2_16x8b);
            uv0_3_16x8b = _mm_avg_epu8(uv0_3_16x8b, uv1_3_16x8b);

            _mm_storeu_si128((__m128i *)pu1_dst, uv0_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
            _mm_storeu_si128(
                            (__m128i *)(pu1_dst + (dst_strd << 1)), uv0_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), uv0_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}
346 
347 /*****************************************************************************/
348 /*                                                                           */
349 /*  Function Name : ih264_weighted_pred_luma_sse42                           */
350 /*                                                                           */
351 /*  Description   : This function performs the weighted prediction as        */
352 /*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
353 /*                  prediction process" for luma. The function gets one      */
354 /*                  ht x wd block, weights it, rounds it off, offsets it,    */
355 /*                  saturates it to unsigned 8-bit and stores it in the      */
356 /*                  destination block. (ht,wd) can be (4,4), (8,4), (4,8),   */
357 /*                  (8,8), (16,8), (8,16) or (16,16).                        */
358 /*                                                                           */
359 /*  Inputs        : pu1_src  - Pointer to source                             */
360 /*                  pu1_dst  - Pointer to destination                        */
361 /*                  src_strd - stride for source                             */
362 /*                  dst_strd - stride for destination                        */
363 /*                  log_wd   - number of bits to be rounded off              */
364 /*                  wt       - weight value                                  */
365 /*                  ofst     - offset value                                  */
366 /*                  ht       - height of the block                           */
367 /*                  wd       - width of the block                            */
368 /*                                                                           */
369 /*  Issues        : None                                                     */
370 /*                                                                           */
371 /*  Revision History:                                                        */
372 /*                                                                           */
373 /*         DD MM YYYY   Author(s)       Changes                              */
374 /*         04 02 2015   Kaushik         Initial Version                      */
375 /*                      Senthoor                                             */
376 /*                                                                           */
377 /*****************************************************************************/
void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src,
                                    UWORD8 *pu1_dst,
                                    WORD32 src_strd,
                                    WORD32 dst_strd,
                                    WORD32 log_wd,
                                    WORD32 wt,
                                    WORD32 ofst,
                                    WORD32 ht,
                                    WORD32 wd)
{
    __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;

    __m128i wt_8x16b, round_8x16b, ofst_8x16b;

    WORD32 round_val;

    /* sign-extend the packed 16-bit weight and 8-bit offset */
    wt = (WORD16)(wt & 0xffff);
    /* guard log_wd == 0 (a legal luma_log2_weight_denom): 1 << -1 is
       undefined behavior; with no denominator shift the rounding term is 0 */
    round_val = (log_wd > 0) ? (1 << (log_wd - 1)) : 0;
    ofst = (WORD8)(ofst & 0xff);

    wt_8x16b = _mm_set1_epi16(wt);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi16(ofst);

    if(wd == 4)
    {
        __m128i y_0_8x16b, y_2_8x16b;
        WORD32 dst_val0, dst_val1, dst_val2, dst_val3;

        /* 4 rows per iteration; rows are packed two-per-register before
           widening to 16 bits */
        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));

            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
            y_2_16x8b = _mm_unpacklo_epi32(y_2_16x8b, y_3_16x8b);

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);

            /* (pel * wt + round) >> log_wd, then + ofst, per 16-bit lane */
            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);

            /* saturate to [0,255] and split the four rows back out */
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_2_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
            y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8);
            y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12);

            dst_val0 = _mm_cvtsi128_si32(y_0_16x8b);
            dst_val1 = _mm_cvtsi128_si32(y_1_16x8b);
            dst_val2 = _mm_cvtsi128_si32(y_2_16x8b);
            dst_val3 = _mm_cvtsi128_si32(y_3_16x8b);

            /* memcpy instead of *(WORD32 *)pu1_dst = ...: the destination is
               a UWORD8 buffer with no 4-byte alignment guarantee, and a
               type-punned WORD32 store violates strict aliasing (UB) */
            memcpy(pu1_dst, &dst_val0, sizeof(WORD32));
            memcpy(pu1_dst + dst_strd, &dst_val1, sizeof(WORD32));
            memcpy(pu1_dst + (dst_strd << 1), &dst_val2, sizeof(WORD32));
            memcpy(pu1_dst + dst_strd * 3, &dst_val3, sizeof(WORD32));

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i y_0_8x16b, y_1_8x16b, y_2_8x16b, y_3_8x16b;

        /* 4 rows per iteration, one widened register per row */
        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_3_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
            y_3_8x16b = _mm_mullo_epi16(y_3_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
            y_3_8x16b = _mm_adds_epi16(round_8x16b, y_3_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
            y_3_8x16b = _mm_srai_epi16(y_3_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
            y_3_8x16b = _mm_adds_epi16(ofst_8x16b, y_3_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2_8x16b, y_3_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
            y_3_16x8b = _mm_srli_si128(y_2_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        /* 4 rows per iteration; each 16-byte row is widened into a low and
           a high 8x16-bit half */
        do
        {
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));

            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);

            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);

            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);

            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);

            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}
576 
577 /*****************************************************************************/
578 /*                                                                           */
579 /*  Function Name : ih264_weighted_pred_chroma_sse42                         */
580 /*                                                                           */
581 /*  Description   : This function performs the weighted prediction as        */
582 /*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
583 /*                  prediction process" for chroma. The function gets one    */
584 /*                  ht x wd block, weights it, rounds it off, offsets it,    */
585 /*                  saturates it to unsigned 8-bit and stores it in the      */
586 /*                  destination block. (ht,wd) can be (2,2), (4,2), (2,4),   */
587 /*                  (4,4), (8,4), (4,8) or (8,8).                            */
588 /*                                                                           */
589 /*  Inputs        : pu1_src  - Pointer to source                             */
590 /*                  pu1_dst  - Pointer to destination                        */
591 /*                  src_strd - stride for source                             */
592 /*                  dst_strd - stride for destination                        */
593 /*                  log_wd   - number of bits to be rounded off              */
594 /*                  wt       - weight values for u and v                     */
595 /*                  ofst     - offset values for u and v                     */
596 /*                  ht       - height of the block                           */
597 /*                  wd       - width of the block                            */
598 /*                                                                           */
599 /*  Issues        : None                                                     */
600 /*                                                                           */
601 /*  Revision History:                                                        */
602 /*                                                                           */
603 /*         DD MM YYYY   Author(s)       Changes                              */
604 /*         04 02 2015   Kaushik         Initial Version                      */
605 /*                      Senthoor                                             */
606 /*                                                                           */
607 /*****************************************************************************/
void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 log_wd,
                                      WORD32 wt,
                                      WORD32 ofst,
                                      WORD32 ht,
                                      WORD32 wd)
{
    __m128i y_0_16x8b, y_1_16x8b;

    __m128i wt_8x16b, round_8x16b, ofst_8x16b;

    WORD32 ofst_u, ofst_v;
    WORD32 round_val;

    /* wt carries the U weight in its low 16 bits and the V weight in     */
    /* the next 16 bits; ofst carries the U offset in byte 0 and the V    */
    /* offset in byte 1. Extract the offsets with sign extension.         */
    ofst_u = (WORD8)(ofst & 0xff);
    ofst_v = (WORD8)(ofst >> 8);
    /* NOTE(review): 1 << (log_wd - 1) assumes log_wd >= 1 - confirm the  */
    /* caller never invokes this path with log_wd == 0.                   */
    round_val = 1 << (log_wd - 1);
    /* repack the sign-extended offsets as two 16-bit fields (U low,      */
    /* V high) so they line up with the interleaved UV samples            */
    ofst = (ofst_u & 0xffff) | (ofst_v << 16);

    wt_8x16b = _mm_set1_epi32(wt);            /* U,V weight in each lane  */
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst);        /* U,V offset in each lane  */

    if(wd == 2) /* 2 UV pairs = 4 interleaved bytes per row */
    {
        __m128i y_0_8x16b;

        do /* processes 2 rows per iteration; ht is assumed even */
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            /* pack both 4-byte rows into the low 8 bytes of one register */
            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);

            /* widen to 16 bits, then weight, round, shift, offset        */
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);

            /* saturate to unsigned 8 bits and store one row at a time    */
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4) /* 4 UV pairs = 8 interleaved bytes per row */
    {
        __m128i y_0_8x16b, y_1_8x16b;

        do /* processes 2 rows per iteration; ht is assumed even */
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            /* widen each row to 16 bits */
            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);

            /* saturate both rows to u8; row 1 sits in the upper 8 bytes  */
            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else /* wd == 8: 8 UV pairs = 16 interleaved bytes per row */
    {
        __m128i y_2_16x8b, y_3_16x8b;
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do /* processes 4 rows per iteration; ht assumed multiple of 4 */
        {
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));

            /* split each 16-byte row into low (L) and high (H) halves,   */
            /* zero-extended to 16 bits per sample                        */
            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);

            /* weight all eight half-rows */
            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);

            /* add rounding term (saturating) */
            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);

            /* arithmetic shift right by log_wd */
            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);

            /* add per-component U/V offsets (saturating) */
            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);

            /* saturate to unsigned 8 bits and rejoin the halves */
            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}
781 
782 /*****************************************************************************/
783 /*                                                                           */
784 /*  Function Name : ih264_weighted_bi_pred_luma_sse42                        */
785 /*                                                                           */
786 /*  Description   : This function performs the weighted biprediction as      */
787 /*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
788 /*                  prediction process" for luma. The function gets two      */
789 /*                  ht x wd blocks, weights them, adds them, rounds off the  */
790 /*                  sum, offsets it, saturates it to unsigned 8-bit and      */
791 /*                  stores it in the destination block. (ht,wd) can be       */
792 /*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
793 /*                                                                           */
794 /*  Inputs        : pu1_src1  - Pointer to source 1                          */
795 /*                  pu1_src2  - Pointer to source 2                          */
796 /*                  pu1_dst   - Pointer to destination                       */
797 /*                  src_strd1 - stride for source 1                          */
798 /*                  src_strd2 - stride for source 2                          */
/*                  dst_strd  - stride for destination                       */
800 /*                  log_wd    - number of bits to be rounded off             */
801 /*                  wt1       - weight value for source 1                    */
802 /*                  wt2       - weight value for source 2                    */
803 /*                  ofst1     - offset value for source 1                    */
804 /*                  ofst2     - offset value for source 2                    */
805 /*                  ht        - height of the block                          */
806 /*                  wd        - width of the block                           */
807 /*                                                                           */
808 /*  Issues        : None                                                     */
809 /*                                                                           */
810 /*  Revision History:                                                        */
811 /*                                                                           */
812 /*         DD MM YYYY   Author(s)       Changes                              */
813 /*         04 02 2015   Kaushik         Initial Version                      */
814 /*                      Senthoor                                             */
815 /*                                                                           */
816 /*****************************************************************************/
void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
                                       UWORD8 *pu1_src2,
                                       UWORD8 *pu1_dst,
                                       WORD32 src_strd1,
                                       WORD32 src_strd2,
                                       WORD32 dst_strd,
                                       WORD32 log_wd,
                                       WORD32 wt1,
                                       WORD32 wt2,
                                       WORD32 ofst1,
                                       WORD32 ofst2,
                                       WORD32 ht,
                                       WORD32 wd)
{
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst;
    WORD32 round_val, shft;

    /* sign-extend the 16-bit weights and the 8-bit offsets               */
    wt1 = (WORD16)(wt1 & 0xffff);
    wt2 = (WORD16)(wt2 & 0xffff);
    /* biprediction per sec 8.4.2.3.2:                                    */
    /*   out = ((p1*wt1 + p2*wt2 + 2^log_wd) >> (log_wd + 1))             */
    /*         + ((ofst1 + ofst2 + 1) >> 1)                               */
    round_val = 1 << log_wd;
    shft = log_wd + 1;
    ofst1 = (WORD8)(ofst1 & 0xff);
    ofst2 = (WORD8)(ofst2 & 0xff);
    ofst = (ofst1 + ofst2 + 1) >> 1;

    wt1_8x16b = _mm_set1_epi16(wt1);
    wt2_8x16b = _mm_set1_epi16(wt2);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi16(ofst);

    if(wd == 4)
    {
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_2_8x16b;
        __m128i y2_0_8x16b, y2_2_8x16b;

        do /* processes 4 rows per iteration; ht assumed multiple of 4 */
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            /* pair up 4-byte rows so each register holds two rows        */
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y1_2_16x8b = _mm_unpacklo_epi32(y1_2_16x8b, y1_3_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
            y2_2_16x8b = _mm_unpacklo_epi32(y2_2_16x8b, y2_3_16x8b);

            /* widen to 16 bits per sample */
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);

            /* weight each source, then combine with saturating adds      */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);

            /* saturate to u8; the four 4-byte rows end up in one vector  */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_2_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
            y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b);


            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_1_8x16b, y1_2_8x16b, y1_3_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b, y2_2_8x16b, y2_3_8x16b;

        do /* processes 4 rows per iteration; ht assumed multiple of 4 */
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            /* widen all eight rows to 16 bits per sample */
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y1_3_8x16b = _mm_cvtepu8_epi16(y1_3_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
            y2_3_8x16b = _mm_cvtepu8_epi16(y2_3_16x8b);

            /* weight each source */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
            y1_3_8x16b = _mm_mullo_epi16(y1_3_8x16b, wt1_8x16b);
            y2_3_8x16b = _mm_mullo_epi16(y2_3_8x16b, wt2_8x16b);

            /* combine the two predictions (saturating) */
            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(y1_3_8x16b, y2_3_8x16b);

            /* round, shift by log_wd + 1, then add the combined offset   */
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(round_8x16b, y1_3_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
            y1_3_8x16b = _mm_srai_epi16(y1_3_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(ofst_8x16b, y1_3_8x16b);

            /* saturate to u8; rows 1 and 3 sit in the upper 8 bytes      */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_2_16x8b = _mm_packus_epi16(y1_2_8x16b, y1_3_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_2_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do /* processes 2 rows per iteration; ht is assumed even */
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            /* split each 16-byte row into low (L) and high (H) halves,   */
            /* zero-extended to 16 bits per sample                        */
            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            /* weight each source */
            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            /* combine the two predictions (saturating) */
            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            /* round, shift by log_wd + 1, then add the combined offset   */
            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            /* saturate to unsigned 8 bits and rejoin the halves */
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}
1068 
1069 /*****************************************************************************/
1070 /*                                                                           */
1071 /*  Function Name : ih264_weighted_bi_pred_chroma_sse42                      */
1072 /*                                                                           */
1073 /*  Description   : This function performs the weighted biprediction as      */
1074 /*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
1075 /*                  prediction process" for chroma. The function gets two    */
1076 /*                  ht x wd blocks, weights them, adds them, rounds off the  */
1077 /*                  sum, offsets it, saturates it to unsigned 8-bit and      */
1078 /*                  stores it in the destination block. (ht,wd) can be       */
1079 /*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
1080 /*                                                                           */
1081 /*  Inputs        : pu1_src1  - Pointer to source 1                          */
1082 /*                  pu1_src2  - Pointer to source 2                          */
1083 /*                  pu1_dst   - Pointer to destination                       */
1084 /*                  src_strd1 - stride for source 1                          */
1085 /*                  src_strd2 - stride for source 2                          */
/*                  dst_strd  - stride for destination                       */
1087 /*                  log_wd    - number of bits to be rounded off             */
1088 /*                  wt1       - weight values for u and v in source 1        */
1089 /*                  wt2       - weight values for u and v in source 2        */
1090 /*                  ofst1     - offset value for u and v in source 1         */
1091 /*                  ofst2     - offset value for u and v in source 2         */
1092 /*                  ht        - height of the block                          */
1093 /*                  wd        - width of the block                           */
1094 /*                                                                           */
1095 /*  Issues        : None                                                     */
1096 /*                                                                           */
1097 /*  Revision History:                                                        */
1098 /*                                                                           */
1099 /*         DD MM YYYY   Author(s)       Changes                              */
1100 /*         04 02 2015   Kaushik         Initial Version                      */
1101 /*                      Senthoor                                             */
1102 /*                                                                           */
1103 /*****************************************************************************/
ih264_weighted_bi_pred_chroma_sse42(UWORD8 * pu1_src1,UWORD8 * pu1_src2,UWORD8 * pu1_dst,WORD32 src_strd1,WORD32 src_strd2,WORD32 dst_strd,WORD32 log_wd,WORD32 wt1,WORD32 wt2,WORD32 ofst1,WORD32 ofst2,WORD32 ht,WORD32 wd)1104 void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
1105                                          UWORD8 *pu1_src2,
1106                                          UWORD8 *pu1_dst,
1107                                          WORD32 src_strd1,
1108                                          WORD32 src_strd2,
1109                                          WORD32 dst_strd,
1110                                          WORD32 log_wd,
1111                                          WORD32 wt1,
1112                                          WORD32 wt2,
1113                                          WORD32 ofst1,
1114                                          WORD32 ofst2,
1115                                          WORD32 ht,
1116                                          WORD32 wd)
1117 {
1118     __m128i y1_0_16x8b, y1_1_16x8b;
1119     __m128i y2_0_16x8b, y2_1_16x8b;
1120 
1121     __m128i wt1_8x16b, wt2_8x16b;
1122     __m128i ofst_8x16b, round_8x16b;
1123 
1124     WORD32 ofst1_u, ofst2_u, ofst_u;
1125     WORD32 ofst1_v, ofst2_v, ofst_v;
1126     WORD32 round_val, shft, ofst_val;
1127 
1128     round_val = 1 << log_wd;
1129     shft = log_wd + 1;
1130 
1131     ofst1_u = (WORD8)(ofst1 & 0xff);
1132     ofst1_v = (WORD8)(ofst1 >> 8);
1133     ofst2_u = (WORD8)(ofst2 & 0xff);
1134     ofst2_v = (WORD8)(ofst2 >> 8);
1135 
1136     wt1_8x16b = _mm_set1_epi32(wt1);
    wt2_8x16b = _mm_set1_epi32(wt2);

    /* NOTE(review): this span is the tail of a weighted bi-prediction chroma
       kernel (cf. ih264_weighted_bipred_chroma_sse42 in the file header);
       the signature and local declarations precede this view — confirm
       against the full function. */

    /* Combined bipred offset per H.264: (o1 + o2 + 1) >> 1, computed
       separately for the U and V chroma components. */
    ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
    ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
    /* Pack U offset in the low 16 bits and V offset in the high 16 bits of a
       32-bit word, so _mm_set1_epi32 yields alternating U/V 16-bit offsets
       matching the interleaved UVUV... chroma layout. */
    ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);

    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst_val);

    if(wd == 2)
    {
        /* wd == 2: two UV pairs (4 bytes) per row; process 2 rows per
           iteration by packing both rows into one 128-bit register. */
        __m128i y1_0_8x16b, y2_0_8x16b;

        do
        {
            /* Load 2 rows from each of the two reference sources. */
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            /* Interleave the two 4-byte rows into the low 8 bytes. */
            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);

            /* Zero-extend u8 pixels to s16 lanes (SSE4.1). */
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);

            /* Weighted sum: src1*wt1 + src2*wt2 (wt vectors carry alternating
               U/V weights, matching the interleaved chroma samples). */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            /* Round, arithmetic-shift down, then add the combined offset. */
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);

            /* Saturating pack back to u8 clips the result to [0, 255]. */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);

            /* Store 4 bytes (one row) each. */
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);

            /* Advance 2 rows; assumes ht is even — TODO confirm caller
               contract from the function head. */
            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        /* wd == 4: four UV pairs (8 bytes) per row; one register row each,
           2 rows per iteration. */
        __m128i y1_0_8x16b, y1_1_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            /* Zero-extend u8 -> s16 for exact 16-bit arithmetic. */
            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);

            /* Weighted sum of both predictions, per row. */
            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);

            /* Round, shift, offset — same fixed-point sequence as wd == 2. */
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);

            /* Pack both rows into one register with u8 saturation, then
               split the upper row back out for the second store. */
            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        /* wd == 8: eight UV pairs (16 bytes) per row; each row is split into
           low/high 8-pixel halves so the math stays in 16-bit lanes. */
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            /* Unaligned 16-byte loads: 2 rows from each source. */
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            /* Low half via cvtepu8 zero-extend; high half via unpack with a
               zero register — both yield u8 widened to s16. */
            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            /* Apply per-component weights to every half of every row. */
            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            /* Saturating add of the two weighted predictions. */
            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            /* Fixed-point finish: + round, >> shft, + offset. */
            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            /* Re-pack the halves to u8 with saturation and store 16 bytes
               per row. */
            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}
1304