/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_atom_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
* prediction
*
* @author
*
*
* @par List of Functions:
*   - ihevc_weighted_pred_uni_ssse3()
*   - ihevc_weighted_pred_bi_ssse3()
*   - ihevc_weighted_pred_bi_default_ssse3()
*   - ihevc_weighted_pred_chroma_uni_ssse3()
*   - ihevc_weighted_pred_chroma_bi_ssse3()
*   - ihevc_weighted_pred_chroma_bi_default_ssse3()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>
#include <assert.h>

#include "ihevc_debug.h"
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_defs.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_inter_pred.h"


#include <immintrin.h>
/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed by pi2_src and stores
* it at the location pointed by pu1_dst
*
* @par Description:
*  dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) +
* offset
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to the source
*
* @param[in] off0
*  offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_uni_ssse3(WORD16 *pi2_src,
                                   UWORD8 *pu1_dst,
                                   WORD32 src_strd,
                                   WORD32 dst_strd,
                                   WORD32 wgt0,
                                   WORD32 off0,
                                   WORD32 shift,
                                   WORD32 lvl_shift,
                                   WORD32 ht,
                                   WORD32 wd)
{
    WORD32 row, col, temp;

    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register                            */
    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
    __m128i res_temp0_4x32b, res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b;

    ASSERT(wd % 4 == 0); /* checking assumption*/
    ASSERT(ht % 4 == 0); /* checking assumption*/

    temp = 1 << (shift - 1);

    // setting values in register
    lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
    wgt0_8x16b = _mm_set1_epi16(wgt0);

    /* lvl_shift * wgt0 */
    res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
    res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);

    const_temp_4x32b = _mm_set1_epi32(temp);
    off0_4x32b = _mm_set1_epi32(off0);


    /* lvl_shift * wgt0 */
    lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
    /* lvl_shift * wgt0 + 1 << (shift - 1) */
    lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);

    if(0 == (wd & 7)) /* wd multiple of 8 case */
    {
        __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;

        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 4)
        {
            for(col = 0; col < wd; col += 8)
            {   /* for rows 0, 1, 2, 3 */

                /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
                /* row = 1 */
                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
                /* row = 2 */
                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
                /* row = 3 */
                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));

                /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
                res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
                res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);

                /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
                src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
                src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);

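                /* Note: SSSE3 has no widening 16x16 -> 32 bit multiply, so */
                /* the low (mullo) and high (mulhi) 16-bit halves of each   */
                /* signed product are computed separately and interleaved   */
                /* below to reconstruct the full 32-bit results.            */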
                /* Get 32 bit Result */
                res_temp4_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
                res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
                res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
                res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);

                res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
                res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);

                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);
                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);

                /* (i4_tmp >> shift) */ /* First 4 pixels */
                res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
                res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);

                /* (i4_tmp >> shift) */ /* Last 4 pixels */
                res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift);
                res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift);
                res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift);
                res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift);

                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);

                /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);

                res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp4_4x32b);
                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
                res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp6_4x32b);
                res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
                res_temp2_4x32b = _mm_packus_epi16(res_temp2_4x32b, res_temp2_4x32b);
                res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);

                /* store eight 8-bit output values per row */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), res_temp2_4x32b); /* row = 2*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), res_temp3_4x32b); /* row = 3*/

                /* To update pointer */
                pi2_src += 8;
                pu1_dst += 8;

            } /* inner loop ends here (8 output values per row in a single iteration) */

            pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

        }
    }
    else  /* wd multiple of 4 case */
    {
        WORD32 dst0, dst1, dst2, dst3;
        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 4)
        {
            for(col = 0; col < wd; col += 4)
            {   /* for rows 0, 1, 2, 3 */

                /* row = 0 */ /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
                src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
                /* row = 1 */
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
                /* row = 2 */
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 2 * src_strd));
                /* row = 3 */
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 3 * src_strd));

                /* 2 rows together */
                src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp2_8x16b);
                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);

                /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
                res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
                /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Higher 16 bit */
                src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);

                /* Get 32 bit Result */
                res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
                res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);

                res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);

                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);

                /* (i4_tmp >> shift) */
                res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
                res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);

                /*i4_tmp = (i4_tmp >> shift) + off0; */
                res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);

                res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);
                res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp2_4x32b);

                dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
                /* dst row = 1 to 3 */
                res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);
                res_temp2_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 2);
                res_temp3_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 3);

                /* store four 8-bit output values  */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
                dst2 = _mm_cvtsi128_si32(res_temp2_4x32b);
                dst3 = _mm_cvtsi128_si32(res_temp3_4x32b);

                /* row = 1 to row = 3 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;

                /* To update pointer */
                pi2_src += 4;
                pu1_dst += 4;

            } /* inner loop ends here (4 output values per row in a single iteration) */

            pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

        }
    }
}

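/* For reference, a minimal scalar sketch of what the SIMD path above
 * computes per pixel (illustrative only; the helper name below is
 * hypothetical and this is not part of the optimized build):
 *
 *     static void weighted_pred_uni_scalar(WORD16 *pi2_src, UWORD8 *pu1_dst,
 *                                          WORD32 src_strd, WORD32 dst_strd,
 *                                          WORD32 wgt0, WORD32 off0,
 *                                          WORD32 shift, WORD32 lvl_shift,
 *                                          WORD32 ht, WORD32 wd)
 *     {
 *         WORD32 row, col, i4_tmp;
 *         for(row = 0; row < ht; row++)
 *         {
 *             for(col = 0; col < wd; col++)
 *             {
 *                 i4_tmp = (pi2_src[col] + lvl_shift) * wgt0;
 *                 i4_tmp += 1 << (shift - 1);
 *                 i4_tmp = (i4_tmp >> shift) + off0;
 *                 pu1_dst[col] = CLIP_U8(i4_tmp);
 *             }
 *             pi2_src += src_strd;
 *             pu1_dst += dst_strd;
 *         }
 *     }
 */
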
/**
*******************************************************************************
*
* @brief
* Does chroma uni-weighted prediction on the array pointed by pi2_src and
* stores it at the location pointed by pu1_dst
*
* @par Description:
*  dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) +
* offset
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to the source
*
* @param[in] off0
*  offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (each colour component)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_weighted_pred_chroma_uni_ssse3(WORD16 *pi2_src,
                                          UWORD8 *pu1_dst,
                                          WORD32 src_strd,
                                          WORD32 dst_strd,
                                          WORD32 wgt0_cb,
                                          WORD32 wgt0_cr,
                                          WORD32 off0_cb,
                                          WORD32 off0_cr,
                                          WORD32 shift,
                                          WORD32 lvl_shift,
                                          WORD32 ht,
                                          WORD32 wd)
{
    WORD32 row, col, temp, wdx2;
    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register                            */

    __m128i src_temp0_8x16b, src_temp1_8x16b;
    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
    __m128i res_temp0_4x32b, res_temp1_4x32b;

    ASSERT(wd % 2 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/

    temp = 1 << (shift - 1);
    wdx2 = 2 * wd;

    // setting values in register
    lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
    wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);

    /* lvl_shift * wgt0 */
    res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
    res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);

    const_temp_4x32b = _mm_set1_epi32(temp);
    off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);

    /* lvl_shift * wgt0 */
    lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
    /* lvl_shift * wgt0 + 1 << (shift - 1) */
    lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);

    {
        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
        {
            __m128i src_temp2_8x16b, src_temp3_8x16b;
            __m128i res_temp2_4x32b, res_temp3_4x32b;
            __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;

            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 16)
                {
                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
                    /* row = 1 */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
                    /* row = 0 */ /* Next 8 pixels */
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));

                    /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
                    res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
                    res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
                    res_temp4_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
                    res_temp5_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);

                    /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
                    src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
                    src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
                    src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
                    src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);

                    /* Get 32 bit Result */
                    res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
                    res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
                    res_temp6_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp2_8x16b);
                    res_temp7_4x32b = _mm_unpackhi_epi16(res_temp5_4x32b, src_temp3_8x16b);

                    res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
                    res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
                    res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp2_8x16b);
                    res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, src_temp3_8x16b);

                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
                    res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
                    res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
                    res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
                    res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);

                    /* (i4_tmp >> shift) */
                    res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
                    res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
                    res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b,  shift);
                    res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);

                    /* (i4_tmp >> shift) */
                    res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b,  shift);
                    res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
                    res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b,  shift);
                    res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
                    res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
                    res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
                    res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);

                    res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
                    res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
                    res_temp4_4x32b = _mm_packs_epi32(res_temp4_4x32b, res_temp6_4x32b);
                    res_temp5_4x32b = _mm_packs_epi32(res_temp5_4x32b, res_temp7_4x32b);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp4_4x32b);
                    res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp5_4x32b);

                    /* store 16 8-bit output values  */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/

                    pi2_src += 16;  /* Pointer update */
                    pu1_dst += 16; /* Pointer update */

                } /* inner loop ends here (16 output values per row in a single iteration) */
                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
            }
        }
        else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
        {
            __m128i res_temp2_4x32b, res_temp3_4x32b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 8)
                {
                    /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
                    /* row = 1 */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));

                    /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
                    res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
                    res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
                    /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
                    src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
                    src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);

                    /* Get 32 bit Result */
                    res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
                    res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);

                    res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
                    res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);

                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);

                    /* (i4_tmp >> shift) */
                    res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
                    res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
                    res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b,  shift);
                    res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
                    /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
                    res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);

                    res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
                    res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
                    res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);

                    /* store eight 8-bit output values per row */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/

                    pi2_src += 8;   /* Pointer update */
                    pu1_dst += 8; /* Pointer update */

                } /* inner loop ends here (8 output values per row in a single iteration) */
                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
            }
        }
        else /* 2*wd multiple of 4 case */
        {
            WORD32 dst0, dst1;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 4)
                {
                    /* row = 0 */ /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
                    src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
                    /* row = 1 */
                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));

                    /* 2 rows together */
                    src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);

                    /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
                    res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
                    /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
                    src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);

                    /* Get 32 bit Result */
                    res_temp1_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
                    res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);

                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);

                    /* (i4_tmp >> shift) */
                    res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
                    res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
                    res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);

                    res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);

                    dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
                    /* dst row = 1 */
                    res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);

                    /* store four 8-bit output values  */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
                    /* row = 1 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                    pi2_src += 4;   /* Pointer update */
                    pu1_dst += 4; /* Pointer update */

                } /* inner loop ends here (4 output values per row in a single iteration) */
                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
            }
        }
    }
}

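/* The chroma variant follows the same arithmetic, but the source is
 * interleaved Cb/Cr, so the weight and offset alternate with the pixels.
 * A scalar sketch of one row pair (illustrative only):
 *
 *     for(col = 0; col < 2 * wd; col += 2)
 *     {
 *         WORD32 cb = ((pi2_src[col]     + lvl_shift) * wgt0_cb
 *                        + (1 << (shift - 1))) >> shift;
 *         WORD32 cr = ((pi2_src[col + 1] + lvl_shift) * wgt0_cr
 *                        + (1 << (shift - 1))) >> shift;
 *         pu1_dst[col]     = CLIP_U8(cb + off0_cb);
 *         pu1_dst[col + 1] = CLIP_U8(cr + off0_cr);
 *     }
 */
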
/**
*******************************************************************************
*
* @brief
*  Does bi-weighted prediction on the arrays pointed by pi2_src1 and
* pi2_src2 and stores it at the location pointed by pu1_dst
*
* @par Description:
*  dst = ((src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
* ((off0 + off1 + 1) << (shift - 1))) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to source 1
*
* @param[in] off0
*  offset 0
*
* @param[in] wgt1
*  weight to be multiplied to source 2
*
* @param[in] off1
*  offset 1
*
* @param[in] shift
*  (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_weighted_pred_bi_ssse3(WORD16 *pi2_src1,
                                  WORD16 *pi2_src2,
                                  UWORD8 *pu1_dst,
                                  WORD32 src_strd1,
                                  WORD32 src_strd2,
                                  WORD32 dst_strd,
                                  WORD32 wgt0,
                                  WORD32 off0,
                                  WORD32 wgt1,
                                  WORD32 off1,
                                  WORD32 shift,
                                  WORD32 lvl_shift1,
                                  WORD32 lvl_shift2,
                                  WORD32 ht,
                                  WORD32 wd)
{
    WORD32 row, col, temp;

    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
    __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;

    ASSERT(wd % 4 == 0); /* checking assumption*/
    ASSERT(ht % 4 == 0); /* checking assumption*/

    temp = (off0 + off1 + 1) << (shift - 1);

    // setting values in register
    lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
    wgt0_8x16b = _mm_set1_epi16(wgt0);
    lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
    wgt1_8x16b = _mm_set1_epi16(wgt1);

    /* lvl_shift1 * wgt0 */
    res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
    res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
    /* lvl_shift2 * wgt1 */
    res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
    res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);

    const_temp_4x32b = _mm_set1_epi32(temp);

    /* lvl_shift1 * wgt0 */
    lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
    /* lvl_shift2 * wgt1 */
    lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);

    if(0 == (wd & 7)) /* wd multiple of 8 case */
    {
        __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 8)
            {
                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */

                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
                res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
                res_temp4_4x32b  = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
                src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
                src_temp4_8x16b  = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);

                /* Get 32 bit Result */
                res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
                res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
                res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
                res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);

                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
                res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
                res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);

                /* (pi2_src[col] + lvl_shift) * wgt */
                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
                res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
                /* (i4_tmp >> shift) */
                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);

                /* Next 4 Pixels */
                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
                res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
                res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);

                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
                res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
                res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);

                /* store eight 8-bit output values per row */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/

                pi2_src1 += 8;  /* Pointer update */
                pi2_src2 += 8;  /* Pointer update */
                pu1_dst  += 8;  /* Pointer update */

            } /* inner loop ends here (8 output values per row in a single iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */

        } /* outer loop ends */
    }
    else /* wd multiple of 4 case */
    {
        WORD32 dst0, dst1;
        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 4)
            {
                /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */

                /* 2 rows together */
                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);

                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
                /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);

                /* Get 32 bit Result */
                res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
                res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);

                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);

                /* (pi2_src[col] + lvl_shift) * wgt */
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);

                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */
                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);

                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);

                dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);

                /* dst row = 1 */
                res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);

                /* store four 8-bit output values  */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);

                /* row = 1 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                pi2_src1 += 4;  /* Pointer update */
                pi2_src2 += 4;  /* Pointer update */
                pu1_dst  += 4;  /* Pointer update */

            } /* inner loop ends here (4 output values per row in a single iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */

        } /* outer loop ends */
    }

}

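/* Scalar sketch of the bi-directional weighting implemented above
 * (illustrative only): both references are weighted, the combined
 * rounding term folds in off0 and off1, and a single shift follows.
 *
 *     i4_tmp  = (pi2_src1[col] + lvl_shift1) * wgt0;
 *     i4_tmp += (pi2_src2[col] + lvl_shift2) * wgt1;
 *     i4_tmp += (off0 + off1 + 1) << (shift - 1);
 *     pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
 */
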
/**
*******************************************************************************
*
* @brief
* Does chroma bi-weighted prediction on the arrays pointed by pi2_src1 and
* pi2_src2 and stores it at the location pointed by pu1_dst
*
* @par Description:
*  dst = ((src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
* ((off0 + off1 + 1) << (shift - 1))) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to source 1
*
* @param[in] off0
*  offset 0
*
* @param[in] wgt1
*  weight to be multiplied to source 2
*
* @param[in] off1
*  offset 1
*
* @param[in] shift
*  (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (each colour component)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_weighted_pred_chroma_bi_ssse3(WORD16 *pi2_src1,
                                         WORD16 *pi2_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1,
                                         WORD32 src_strd2,
                                         WORD32 dst_strd,
                                         WORD32 wgt0_cb,
                                         WORD32 wgt0_cr,
                                         WORD32 off0_cb,
                                         WORD32 off0_cr,
                                         WORD32 wgt1_cb,
                                         WORD32 wgt1_cr,
                                         WORD32 off1_cb,
                                         WORD32 off1_cr,
                                         WORD32 shift,
                                         WORD32 lvl_shift1,
                                         WORD32 lvl_shift2,
                                         WORD32 ht,
                                         WORD32 wd)
{
    WORD32 row, col, temp1, temp2;
    WORD32 wdx2;

    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
    __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;

    ASSERT(wd % 2 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/

    temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
    temp2 = (off0_cr + off1_cr + 1) << (shift - 1);

    // setting values in register
    lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
    wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
    lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
    wgt1_8x16b = _mm_set_epi16(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);

    /* lvl_shift1 * wgt0 */
    res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
    res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
    /* lvl_shift2 * wgt1 */
    res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
    res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);

    const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
    wdx2 = wd * 2;

    /* lvl_shift1 * wgt0 */
    lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
    /* lvl_shift2 * wgt1 */
    lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);

1073     if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
1074     {
1075         __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
1076         /*  outer for loop starts from here */
1077         for(row = 0; row < ht; row += 2)
1078         {
1079             for(col = 0; col < wdx2; col += 8)
1080             {
1081                 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1082                 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
1083                 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
1084                 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
1085                 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
1086 
1087                 /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
1088                 res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
1089                 res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
1090                 res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
1091                 res_temp4_4x32b  = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
1092                 /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
1093                 src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
1094                 src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
1095                 src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
1096                 src_temp4_8x16b  = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);
1097 
1098                 /* Get 32 bit Result */
1099                 res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
1100                 res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
1101                 res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
1102                 res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);
1103 
1104                 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
1105                 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
1106                 res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
1107                 res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);

                /* (pi2_src[col] + lvl_shift) * wgt */
                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
                res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
                res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
                /* (i4_tmp >> shift) */
                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);

                /* Higher 4 pixels of each row */
                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
                res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
                res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
                res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
                res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);

                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
                res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
                res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);

                /* store eight 8-bit output values  */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/

                pi2_src1 += 8;  /* Pointer update */
                pi2_src2 += 8;  /* Pointer update */
                pu1_dst  += 8;  /* Pointer update */

            } /* inner loop ends here(8 output values per row in a single iteration) */

            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;     /* Pointer update */

        } /* outer loop ends */
    }
    else /* wdx2 multiple of 4 case */
    {
        WORD32 dst0, dst1;
        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wdx2; col += 4)
            {
                /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */

                /* 2 rows together */
                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);

                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
                res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
                res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
                /*i4_tmp = (pi2_src[col]) * wgt*/ /* Higher 16 bit */
                src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
                src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);

                /* Get 32 bit Result */
                res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
                res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);

                res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
                res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);

                /* (pi2_src[col] + lvl_shift) * wgt */
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
                res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
                res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);

                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
                res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */
                res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
                res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);

                res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);

                dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);

                /* dst row = 1 */
                res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);

                /* store four 8-bit output values  */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);

                /* row = 1 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                pi2_src1 += 4;  /* Pointer update */
                pi2_src2 += 4;  /* Pointer update */
                pu1_dst  += 4;  /* Pointer update */

            } /* inner loop ends here(4-output values in single iteration) */

            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;     /* Pointer update */
        }
    }

}

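/*
 * A minimal scalar sketch (an illustrative addition, compiled out; the helper
 * name is hypothetical and not part of the library) of the per-pixel
 * bi-weighted operation the SIMD loops above implement, following the
 * formula in the in-line comments:
 *     i4_tmp  = (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1;
 *     i4_tmp += (off0 + off1 + 1) << (shift - 1);
 *     dst     = CLIP_U8(i4_tmp >> shift);
 * CLIP_U8 is the clipping macro this library already provides.
 */
#if 0
static UWORD8 bi_weighted_pixel_ref(WORD16 src1, WORD16 src2,
                                    WORD32 wgt0, WORD32 wgt1,
                                    WORD32 off0, WORD32 off1,
                                    WORD32 lvl_shift1, WORD32 lvl_shift2,
                                    WORD32 shift)
{
    WORD32 i4_tmp = (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1;

    i4_tmp += (off0 + off1 + 1) << (shift - 1);
    return (UWORD8)CLIP_U8(i4_tmp >> shift);
}
#endif
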
/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the arrays pointed to by pi2_src1
* and pi2_src2 and stores the result at the location pointed to by pu1_dst
*
* @par Description:
*  dst = ((src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)))
* >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
* Assumption : ht%2 == 0, wd%4 == 0
* shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case,
* the final result will match even if the intermediate precision is 16 bit.
*
*******************************************************************************
*/
void ihevc_weighted_pred_bi_default_ssse3(WORD16 *pi2_src1,
                                          WORD16 *pi2_src2,
                                          UWORD8 *pu1_dst,
                                          WORD32 src_strd1,
                                          WORD32 src_strd2,
                                          WORD32 dst_strd,
                                          WORD32 lvl_shift1,
                                          WORD32 lvl_shift2,
                                          WORD32 ht,
                                          WORD32 wd)
{
    {
        WORD32 row, col, temp;
        WORD32 shift;

        __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
        __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
        __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

        ASSERT(wd % 4 == 0); /* checking assumption*/
        ASSERT(ht % 2 == 0); /* checking assumption*/

        shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
        temp = 1 << (shift - 1);

        // setting values in registers
        lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
        lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
        const_temp_8x16b = _mm_set1_epi16(temp);

        lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
        lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);
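
        /*
         * Note (added): lvl_shift1_8x16b now holds (lvl_shift1 + lvl_shift2)
         * plus the rounding term (1 << (shift - 1)) folded into one 16-bit
         * constant.  Per the assumption above, these sums stay within 16-bit
         * range, so the computation below can use saturating 16-bit adds and
         * still match the 32-bit scalar result.
         */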

        if(0 == (ht & 3)) /* ht multiple of 4*/
        {
            if(0 == (wd & 15)) /* wd multiple of 16 case */
            {
                __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
                __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
                /*  outer for loop starts from here */
                for(row = 0; row < ht; row += 4)
                {
                    for(col = 0; col < wd; col += 16)
                    {
                        /*load 8 pixel values */ /* First 8 Values */
                        src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                        src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                        /* row = 1 */
                        src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                        src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
                        /* row = 2 */
                        src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
                        src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
                        /* row = 3 */
                        src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
                        src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));

                        /*load 8 pixel values */ /* Second 8 Values */
                        src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
                        src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
                        /* row = 1 */
                        src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
                        src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
                        /* row = 2 */
                        src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
                        src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));

                        /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);

                        /*load 8 pixel values */ /* Second 8 Values */
                        /* row = 3 */
                        src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
                        src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));

                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);

                        /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
                        src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
                        src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
                        src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
                        src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);

                        /* (i4_tmp >> shift) */ /* First 8 Values */
                        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
                        src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
                        src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);

                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
                        src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
                        src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
                        src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
                        src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);

                        /* (i4_tmp >> shift) */ /* Second 8 Values */
                        src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
                        src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
                        src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
                        src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);

                        /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* All 16 values */
                        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
                        src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
                        src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
                        src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);

                        /* store sixteen 8-bit output values  */
                        _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
                        _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
                        _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
                        _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/

                        /* To update pointer */
                        pi2_src1 += 16;
                        pi2_src2 += 16;
                        pu1_dst  += 16;

                    } /* inner loop ends here(16 output values per row in a single iteration) */

                    pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
                    pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
                    pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */

                }
            }
            else if(0 == (wd & 7)) /* multiple of 8 case */
            {
                /*  outer for loop starts from here */
                for(row = 0; row < ht; row += 4)
                {
                    for(col = 0; col < wd; col += 8)
                    {
                        /*load 8 pixel values */
                        src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                        src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                        /* row = 1 */
                        src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                        src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
                        /* row = 2 */
                        src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
                        src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
                        /* row = 3 */
                        src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
                        src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));

                        /* (pi2_src1[col] + pi2_src2[col]) */
                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);

                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                        src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
                        src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);

                        /* (i4_tmp >> shift) */
                        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                        src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
                        src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
                        src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);

                        /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                        src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
                        src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
                        src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);

                        /* store eight 8-bit output values  */
                        _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
                        _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
                        _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
                        _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/

                        /* To update pointer */
                        pi2_src1 += 8;
                        pi2_src2 += 8;
                        pu1_dst  += 8;

                    } /* inner loop ends here(8 output values per row in a single iteration) */

                    pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
                    pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
                    pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */

                }
            }
            else /* wd multiple of 4 case*/
            {
                WORD32 dst0, dst1, dst2, dst3;

                /*  outer for loop starts from here */
                for(row = 0; row < ht; row += 4)
                {
                    for(col = 0; col < wd; col += 4)
                    {
                        /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                        src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                        /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                        src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));

                        /* row = 1 */
                        src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                        src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
                        /* row = 2 */
                        src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
                        src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
                        /* row = 3 */
                        src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
                        src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));

                        /* Pack two rows together */
                        src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                        src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
                        src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
                        src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
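
                        /*
                         * Note (added): two 4-pixel rows now share one 128-bit
                         * register (row r in the low half, row r+1 in the high
                         * half), so the add/shift/pack sequence below handles
                         * both rows at once.
                         */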

                        /* (pi2_src1[col] + pi2_src2[col]) */
                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);

                        /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                        src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                        src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);

                        /* (i4_tmp >> shift) */
                        src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                        src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);

                        /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                        src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                        src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);

                        dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                        /* dst rows 1 and 3 */
                        src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
                        src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);

                        /* store four 8-bit output values  */
                        *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                        dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
                        dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
                        dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);

                        /* row = 1 to row = 3 */
                        *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                        *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                        *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;

                        /* To update pointer */
                        pi2_src1 += 4;
                        pi2_src2 += 4;
                        pu1_dst  += 4;

                    } /* inner loop ends here(4-output values in single iteration) */

                    pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
                    pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
                    pu1_dst  = pu1_dst  - wd + 4 * dst_strd;  /* Pointer update */

                }
            }
        }
        else /* ht multiple of 2 case and wd multiple of 4 case*/
        {

            WORD32 dst0, dst1;

            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wd; col += 4)
                {
                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));

                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));

                    /* Pack two rows together */
                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);

                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                    /* dst row = 1 */
                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);

                    /* store four 8-bit output values  */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);

                    /* row = 1 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                    /* To update pointer */
                    pi2_src1 += 4;
                    pi2_src2 += 4;
                    pu1_dst  += 4;

                } /* inner loop ends here(4-output values in single iteration) */

                pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
                pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
                pu1_dst  = pu1_dst  - wd + 2 * dst_strd;  /* Pointer update */

            }

        }

    }
}
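
/*
 * A minimal scalar sketch (an illustrative addition, compiled out; the helper
 * name is hypothetical and not part of the library) of the per-pixel default
 * bi-prediction above.  For 8-bit content, shift = 15 - 8 = 7 and the
 * rounding term is 1 << 6 = 64, so e.g. src1 = 8000, src2 = 8200 with zero
 * level shifts gives (8000 + 8200 + 64) >> 7 = 127.
 */
#if 0
static UWORD8 bi_default_pixel_ref(WORD16 src1, WORD16 src2,
                                   WORD32 lvl_shift1, WORD32 lvl_shift2,
                                   WORD32 shift)
{
    WORD32 i4_tmp = (src1 + lvl_shift1) + (src2 + lvl_shift2);

    i4_tmp += 1 << (shift - 1);
    return (UWORD8)CLIP_U8(i4_tmp >> shift);
}
#endif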

/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the chroma arrays pointed to by
* pi2_src1 and pi2_src2 and stores the result at the location pointed to by
* pu1_dst
*
* @par Description:
*  dst = ((src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)))
* >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (each colour component)
*
* @returns
*
* @remarks
*  None
*
* Assumption : ht%2 == 0, wd%2 == 0, lvl_shift1 == 0, lvl_shift2 == 0.
* shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case,
* the final result will match even if the intermediate precision is 16 bit.
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_default_ssse3(WORD16 *pi2_src1,
                                                 WORD16 *pi2_src2,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd1,
                                                 WORD32 src_strd2,
                                                 WORD32 dst_strd,
                                                 WORD32 lvl_shift1,
                                                 WORD32 lvl_shift2,
                                                 WORD32 ht,
                                                 WORD32 wd)
{
    WORD32 row, col, temp;
    WORD32 shift, wdx2;

    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
    __m128i lvl_shift1_8x16b;
    __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

    ASSERT(wd % 2 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/
    UNUSED(lvl_shift1);
    UNUSED(lvl_shift2);
    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    temp = 1 << (shift - 1);
    wdx2 = wd * 2;

    // setting values in registers
    lvl_shift1_8x16b = _mm_set1_epi16(temp);
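
    /*
     * Note (added): lvl_shift1 and lvl_shift2 are assumed to be zero here
     * (see the UNUSED() markers and the assumption above), so this register
     * carries only the rounding term 1 << (shift - 1).  Chroma samples are
     * processed interleaved (Cb/Cr), hence each row holds wdx2 = 2 * wd
     * values.
     */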

    if(0 == (ht & 3)) /* ht multiple of 4 case */
    {
        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
        {
            __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
            __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wdx2; col += 16)
                {
                    /*load 8 pixel values */ /* First 8 Values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /*load 8 pixel values */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
                    /* row = 1 */
                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
                    /* row = 2 */
                    src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
                    src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);

                    /*load 8 pixel values */ /* Second 8 Values */
                    /* row = 3 */
                    src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
                    src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);

                    /* (i4_tmp >> shift) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);

                    /* (i4_tmp >> shift) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
                    src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
                    src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);

                    /* store eight 8-bit output values  */ /* First 8 Values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
                    src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);
                    src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, src_temp13_8x16b);
                    src_temp15_8x16b = _mm_packus_epi16(src_temp15_8x16b, src_temp15_8x16b);

                    /* store eight 8-bit output values  */ /* Second 8 Values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 1*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd + 8), src_temp13_8x16b); /* row = 2*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd + 8), src_temp15_8x16b); /* row = 3*/

                    /* To update pointer */
                    pi2_src1 += 16;
                    pi2_src2 += 16;
                    pu1_dst  += 16;

                } /* inner loop ends here(16 output values per row in a single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;    /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;    /* Pointer update */
                pu1_dst  = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */

            }
        }
        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
        {
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wdx2; col += 8)
                {
                    /*load 8 pixel values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);

                    /* store eight 8-bit output values  */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/

                    /* To update pointer */
                    pi2_src1 += 8;
                    pi2_src2 += 8;
                    pu1_dst  += 8;

                } /* inner loop ends here(8 output values per row in a single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;    /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;    /* Pointer update */
                pu1_dst  = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */

            }
        }
        else /* 2*wd multiple of 4 case */
        {
            WORD32 dst0, dst1, dst2, dst3;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wdx2; col += 4)
                {
                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));

                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /* Pack two rows together */
                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
                    src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);

                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                    /* dst rows 1 and 3 */
                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
                    src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);

                    /* store four 8-bit output values  */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
                    dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
                    dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);

                    /* row = 1 to row = 3 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;

                    /* To update pointer */
                    pi2_src1 += 4;
                    pi2_src2 += 4;
                    pu1_dst  += 4;

                } /* inner loop ends here(4-output values in single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;   /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;   /* Pointer update */
                pu1_dst  = pu1_dst  - wdx2 + 4 * dst_strd;    /* Pointer update */

            }
        }
    }
    else /* ht multiple of 2 case */
    {
        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
        {
            __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 16)
                {
                    /*load 8 pixel values */ /* First 8 Values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));

                    /*load 8 pixel values */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
                    /* row = 1 */
                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
                    src_temp1_8x16b  = _mm_adds_epi16(src_temp1_8x16b,  src_temp2_8x16b);
                    src_temp3_8x16b  = _mm_adds_epi16(src_temp3_8x16b,  src_temp4_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);

                    /* (i4_tmp >> shift) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);

                    /* (i4_tmp >> shift) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);

                    /* store eight 8-bit output values  */ /* First 8 Values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
                    src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);

                    /* store eight 8-bit output values  */ /* Second 8 Values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 1*/

                    /* To update pointer */
                    pi2_src1 += 16;
                    pi2_src2 += 16;
                    pu1_dst  += 16;

                } /* inner loop ends here(16 output values per row in a single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
                pu1_dst  = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */

            }
        }
        else if(0 == (wdx2 & 7)) /* multiple of 8 case */
        {
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 8)
                {
                    /*load 8 pixel values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);

                    /* store eight 8-bit output values  */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/

                    /* To update pointer */
                    pi2_src1 += 8;
                    pi2_src2 += 8;
                    pu1_dst  += 8;

                } /* inner loop ends here(8 output values per row in a single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
                pu1_dst  = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */

            }
        }
        else /* 2*wd multiple of 4 case */
        {
            WORD32 dst0, dst1;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 4)
                {
                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));

                    /* Pack two rows together */
                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);

                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                    /* dst row = 1 */
                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);

                    /* store four 8-bit output values  */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
                    /* row = 1 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                    /* To update pointer */
                    pi2_src1 += 4;
                    pi2_src2 += 4;
                    pu1_dst  += 4;
                } /* inner loop ends here(4-output values in single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;   /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;   /* Pointer update */
                pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;    /* Pointer update */

            }
        }
    }
}