1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevc_weighted_pred_atom_intr.c
22 *
23 * @brief
24 * Contains function definitions for weighted prediction used in inter
25 * prediction
26 *
27 * @author
28 *
29 *
30 * @par List of Functions:
31 * - ihevc_weighted_pred_uni_ssse3()
32 * - ihevc_weighted_pred_bi_ssse3()
33 * - ihevc_weighted_pred_bi_default_ssse3()
34 * - ihevc_weighted_pred_chroma_uni_ssse3()
35 * - ihevc_weighted_pred_chroma_bi_ssse3()
36 * - ihevc_weighted_pred_chroma_bi_default_ssse3()
37 *
38 * @remarks
39 * None
40 *
41 *******************************************************************************
42 */
43 /*****************************************************************************/
44 /* File Includes */
45 /*****************************************************************************/
46 #include <stdio.h>
47 #include <assert.h>
48
49 #include "ihevc_debug.h"
50 #include "ihevc_typedefs.h"
51 #include "ihevc_macros.h"
52 #include "ihevc_platform_macros.h"
53 #include "ihevc_func_selector.h"
54 #include "ihevc_defs.h"
55 #include "ihevc_weighted_pred.h"
56 #include "ihevc_inter_pred.h"
57
58
59 #include <immintrin.h>
60
61 /**
62 *******************************************************************************
63 *
64 * @brief
65 * Does uni-weighted prediction on the array pointed to by pi2_src and stores
66 * the result at the location pointed to by pu1_dst
67 *
68 * @par Description:
69 * dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift +
70 * offset
71 *
72 * @param[in] pi2_src
73 * Pointer to the source
74 *
75 * @param[out] pu1_dst
76 * Pointer to the destination
77 *
78 * @param[in] src_strd
79 * Source stride
80 *
81 * @param[in] dst_strd
82 * Destination stride
83 *
84 * @param[in] wgt0
85 * weight to be multiplied to the source
86 *
87 * @param[in] off0
88 * offset to be added after rounding and shifting
89 *
93 * @param[in] shift
94 * (14 - Bit depth) + log2_weight_denominator
95 *
96 * @param[in] lvl_shift
97 * added before shift and offset
98 *
99 * @param[in] ht
100 * height of the source
101 *
102 * @param[in] wd
103 * width of the source
104 *
105 * @returns
106 *
107 * @remarks
108 * None
109 *
110 *******************************************************************************
111 */
112
113 void ihevc_weighted_pred_uni_ssse3(WORD16 *pi2_src,
114 UWORD8 *pu1_dst,
115 WORD32 src_strd,
116 WORD32 dst_strd,
117 WORD32 wgt0,
118 WORD32 off0,
119 WORD32 shift,
120 WORD32 lvl_shift,
121 WORD32 ht,
122 WORD32 wd)
123 {
124 WORD32 row, col, temp;
125
126 /* all 128 bit registers are named with a suffix mxnb, where m is the */
127 /* number of n bits packed in the register */
128 __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
129 __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
130 __m128i res_temp0_4x32b, res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b;
131
132 ASSERT(wd % 4 == 0); /* checking assumption*/
133 ASSERT(ht % 4 == 0); /* checking assumption*/
134
135 temp = 1 << (shift - 1);
136
137 // setting values in registers
138 lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
139 wgt0_8x16b = _mm_set1_epi16(wgt0);
140
141 /* lvl_shift * wgt0 */
142 res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
143 res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);
144
145 const_temp_4x32b = _mm_set1_epi32(temp);
146 off0_4x32b = _mm_set1_epi32(off0);
147
148
149 /* lvl_shift * wgt0 */
150 lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
151 /* lvl_shift * wgt0 + 1 << (shift - 1) */
152 lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);
153
154 if(0 == (wd & 7)) /* wd multiple of 8 case */
155 {
156 __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;
157
158 /* outer for loop starts from here */
159 for(row = 0; row < ht; row += 4)
160 {
161 for(col = 0; col < wd; col += 8)
162 { /* for row = 0, 1, 2, 3 */
163
164 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
165 src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
166 /* row = 1 */
167 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
168 /* row = 2 */
169 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
170 /* row = 3 */
171 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));
172
173 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
174 res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
175 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
176 res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
177 res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
178
179 /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
180 src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
181 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
182 src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
183 src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
184
185 /* Get 32 bit Result */
186 res_temp4_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
187 res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
188 res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
189 res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
190
191 res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
192 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
193 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
194 res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
195
196 /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
197 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
198 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
199 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
200 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);
201 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
202 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
203 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
204 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
205
206 /* (i4_tmp >> shift) */ /* First 4 pixels */
207 res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
208 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
209 res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
210 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
211
212 /* (i4_tmp >> shift) */ /* Last 4 pixels */
213 res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift);
214 res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift);
215 res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift);
216 res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift);
217
218 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
219 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
220 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
221 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
222 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
223
224 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
225 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
226 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
227 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
228 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);
229
230 res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp4_4x32b);
231 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
232 res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp6_4x32b);
233 res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
234 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
235 res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
236 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
237 res_temp2_4x32b = _mm_packus_epi16(res_temp2_4x32b, res_temp2_4x32b);
238 res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
239
240 /* store eight 8-bit output values per row */
241 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
242 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
243 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), res_temp2_4x32b); /* row = 2*/
244 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), res_temp3_4x32b); /* row = 3*/
245
246 /* To update pointer */
247 pi2_src += 8;
248 pu1_dst += 8;
249
250 } /* inner loop ends here(4-output values in single iteration) */
251
252 pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
253 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
254
255 }
256 }
257 else /* wd multiple of 4 case */
258 {
259 WORD32 dst0, dst1, dst2, dst3;
260 /* outer for loop starts from here */
261 for(row = 0; row < ht; row += 4)
262 {
263 for(col = 0; col < wd; col += 4)
264 { /* for row = 0, 1, 2, 3 */
265
266 /* row = 0 */ /* load 4 pixel values from 3:0 pos. relative to cur. pos. */
267 src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
268 /* row = 1 */
269 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
270 /* row = 2 */
271 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 2 * src_strd));
272 /* row = 3 */
273 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 3 * src_strd));
274
275 /* 2 rows together */
276 src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp2_8x16b);
277 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
278
279 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
280 res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
281 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
282 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Higher 16 bit */
283 src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
284 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
285
286 /* Get 32 bit Result */
287 res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
288 res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
289
290 res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
291 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
292
293 /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
294 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
295 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
296 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
297 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
298
299 /* (i4_tmp >> shift) */
300 res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
301 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
302 res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
303 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
304
305 /*i4_tmp = (i4_tmp >> shift) + off0; */
306 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
307 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
308 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
309 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
310
311 res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);
312 res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp3_4x32b);
313
314 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
315 res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp2_4x32b);
316
317 dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
318 /* dst row = 1 to 3 */
319 res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);
320 res_temp2_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 2);
321 res_temp3_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 3);
322
323 /* store four 8-bit output values */
324 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
325
326 dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
327 dst2 = _mm_cvtsi128_si32(res_temp2_4x32b);
328 dst3 = _mm_cvtsi128_si32(res_temp3_4x32b);
329
330 /* row = 1 to row = 3 */
331 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
332 *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
333 *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
334
335 /* To update pointer */
336 pi2_src += 4;
337 pu1_dst += 4;
338
339 } /* inner loop ends here(4-output values in single iteration) */
340
341 pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
342 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
343
344 }
345 }
346 }
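/*
 * For reference, a scalar sketch of what the SSSE3 routine above computes
 * (illustrative only, not part of the build; CLIP_U8 is the clipping macro
 * referenced in the comments throughout this file):
 *
 *     for(row = 0; row < ht; row++)
 *         for(col = 0; col < wd; col++)
 *         {
 *             WORD32 i4_tmp = (pi2_src[row * src_strd + col] + lvl_shift) * wgt0;
 *             i4_tmp += 1 << (shift - 1);
 *             pu1_dst[row * dst_strd + col] = CLIP_U8((i4_tmp >> shift) + off0);
 *         }
 */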
347
348 /**
349 *******************************************************************************
350 *
351 * @brief
352 * Does chroma uni-weighted prediction on the array pointed to by pi2_src and
353 * stores the result at the location pointed to by pu1_dst
354 *
355 * @par Description:
356 * dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift +
357 * offset
358 *
359 * @param[in] pi2_src
360 * Pointer to the source
361 *
362 * @param[out] pu1_dst
363 * Pointer to the destination
364 *
365 * @param[in] src_strd
366 * Source stride
367 *
368 * @param[in] dst_strd
369 * Destination stride
370 *
371 * @param[in] wgt0
372 * weight to be multiplied to the source
373 *
374 * @param[in] off0
375 * offset to be added after rounding and shifting
376 *
380 * @param[in] shift
381 * (14 - Bit depth) + log2_weight_denominator
382 *
383 * @param[in] lvl_shift
384 * added before shift and offset
385 *
386 * @param[in] ht
387 * height of the source
388 *
389 * @param[in] wd
390 * width of the source (each colour component)
391 *
392 * @returns
393 *
394 * @remarks
395 * None
396 *
397 *******************************************************************************
398 */
399
400
401 void ihevc_weighted_pred_chroma_uni_ssse3(WORD16 *pi2_src,
402 UWORD8 *pu1_dst,
403 WORD32 src_strd,
404 WORD32 dst_strd,
405 WORD32 wgt0_cb,
406 WORD32 wgt0_cr,
407 WORD32 off0_cb,
408 WORD32 off0_cr,
409 WORD32 shift,
410 WORD32 lvl_shift,
411 WORD32 ht,
412 WORD32 wd)
413 {
414 WORD32 row, col, temp, wdx2;
415 /* all 128 bit registers are named with a suffix mxnb, where m is the */
416 /* number of n bits packed in the register */
417
418 __m128i src_temp0_8x16b, src_temp1_8x16b;
419 __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
420 __m128i res_temp0_4x32b, res_temp1_4x32b;
421
422 ASSERT(wd % 2 == 0); /* checking assumption*/
423 ASSERT(ht % 2 == 0); /* checking assumption*/
424
425 temp = 1 << (shift - 1);
426 wdx2 = 2 * wd;
427
428 // setting values in registers
429 lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
430 wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
431
432 /* lvl_shift * wgt0 */
433 res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
434 res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);
435
436 const_temp_4x32b = _mm_set1_epi32(temp);
437 off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);
438
439 /* lvl_shift * wgt0 */
440 lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
441 /* lvl_shift * wgt0 + 1 << (shift - 1) */
442 lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);
443
444 {
445 if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
446 {
447 __m128i src_temp2_8x16b, src_temp3_8x16b;
448 __m128i res_temp2_4x32b, res_temp3_4x32b;
449 __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;
450
451 /* outer for loop starts from here */
452 for(row = 0; row < ht; row += 2)
453 {
454 for(col = 0; col < wdx2; col += 16)
455 {
456 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
457 src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
458 /* row = 1 */
459 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
460 /* row = 0 */ /* Next 8 pixels */
461 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
462 /* row = 1 */
463 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
464
465 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
466 res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
467 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
468 res_temp4_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
469 res_temp5_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
470
471 /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
472 src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
473 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
474 src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
475 src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
476
477 /* Get 32 bit Result */
478 res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
479 res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
480 res_temp6_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp2_8x16b);
481 res_temp7_4x32b = _mm_unpackhi_epi16(res_temp5_4x32b, src_temp3_8x16b);
482
483 res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
484 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
485 res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp2_8x16b);
486 res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, src_temp3_8x16b);
487
488 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
489 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
490 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
491 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
492 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
493 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
494 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
495 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
496 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);
497
498 /* (i4_tmp >> shift) */
499 res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
500 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
501 res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
502 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
503 /*i4_tmp = (i4_tmp >> shift) + off0; */
504 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
505 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
506 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
507 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
508 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
509
510 /* (i4_tmp >> shift) */
511 res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift);
512 res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift);
513 res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift);
514 res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift);
515 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
516 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
517 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
518 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
519 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
520 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);
521
522 res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
523 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
524 res_temp4_4x32b = _mm_packs_epi32(res_temp4_4x32b, res_temp6_4x32b);
525 res_temp5_4x32b = _mm_packs_epi32(res_temp5_4x32b, res_temp7_4x32b);
526 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
527 res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp4_4x32b);
528 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp5_4x32b);
529
530 /* store 16 8-bit output values */
531 _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
532 _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
533
534 pi2_src += 16; /* Pointer update */
535 pu1_dst += 16; /* Pointer update */
536
537 } /* inner loop ends here(4-output values in single iteration) */
538 pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */
539 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
540 }
541 }
542 else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
543 {
544 __m128i res_temp2_4x32b, res_temp3_4x32b;
545 /* outer for loop starts from here */
546 for(row = 0; row < ht; row += 2)
547 {
548 for(col = 0; col < wdx2; col += 8)
549 {
550 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
551 src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
552 /* row = 1 */
553 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
554
555 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
556 res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
557 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
558 /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
559 src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
560 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
561
562 /* Get 32 bit Result */
563 res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
564 res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
565
566 res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
567 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
568
569 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
570 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
571 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
572 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
573 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
574
575 /* (i4_tmp >> shift) */
576 res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
577 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
578 res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
579 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
580
581 /*i4_tmp = (i4_tmp >> shift) + off0; */
582 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
583 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
584 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
585 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
586 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
587
588 res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
589 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
590
591 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
592 res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
593 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
594
595 /* store eight 8-bit output values per row */
596 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
597 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
598
599 pi2_src += 8; /* Pointer update */
600 pu1_dst += 8; /* Pointer update */
601
602 } /* inner loop ends here(4-output values in single iteration) */
603 pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */
604 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
605 }
606 }
607 else /* 2*wd multiple of 4 case */
608 {
609 WORD32 dst0, dst1;
610 /* outer for loop starts from here */
611 for(row = 0; row < ht; row += 2)
612 {
613 for(col = 0; col < wdx2; col += 4)
614 {
615 /* row = 0 */ /* load 4 pixel values from 3:0 pos. relative to cur. pos. */
616 src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
617 /* row = 1 */
618 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
619
620 /* 2 rows together */
621 src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
622
623 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
624 res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
625 /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
626 src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
627
628 /* Get 32 bit Result */
629 res_temp1_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
630 res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
631
632 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
633 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
634 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
635
636 /* (i4_tmp >> shift) */
637 res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
638 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
639
640 /*i4_tmp = (i4_tmp >> shift) + off0; */
641 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
642 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
643
644 res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);
645
646 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
647 res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
648
649 dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
650 /* dst row = 1 */
651 res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);
652
653 /* store four 8-bit output values */
654 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
655
656 dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
657 /* row = 1 */
658 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
659
660 pi2_src += 4; /* Pointer update */
661 pu1_dst += 4; /* Pointer update */
662
663 } /* inner loop ends here(4-output values in single iteration) */
664 pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */
665 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
666 }
667 }
668 }
669 }
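/*
 * Scalar sketch of the interleaved-chroma uni-weighting above (illustrative
 * only, not part of the build): the source holds Cb and Cr samples
 * interleaved, so even columns use the Cb weight/offset and odd columns the
 * Cr weight/offset:
 *
 *     for(row = 0; row < ht; row++)
 *         for(col = 0; col < 2 * wd; col++)
 *         {
 *             WORD32 wgt = (col & 1) ? wgt0_cr : wgt0_cb;
 *             WORD32 off = (col & 1) ? off0_cr : off0_cb;
 *             WORD32 i4_tmp = (pi2_src[row * src_strd + col] + lvl_shift) * wgt;
 *             i4_tmp += 1 << (shift - 1);
 *             pu1_dst[row * dst_strd + col] = CLIP_U8((i4_tmp >> shift) + off);
 *         }
 */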
670
671 /**
672 *******************************************************************************
673 *
674 * @brief
675 * Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
676 * pi2_src2 and stores the result at the location pointed to by pu1_dst
677 *
678 * @par Description:
679 * dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 +
680 * off1 + 1) << (shift - 1) ) >> shift
681 *
682 * @param[in] pi2_src1
683 * Pointer to source 1
684 *
685 * @param[in] pi2_src2
686 * Pointer to source 2
687 *
688 * @param[out] pu1_dst
689 * Pointer to destination
690 *
691 * @param[in] src_strd1
692 * Source stride 1
693 *
694 * @param[in] src_strd2
695 * Source stride 2
696 *
697 * @param[in] dst_strd
698 * Destination stride
699 *
700 * @param[in] wgt0
701 * weight to be multiplied to source 1
702 *
703 * @param[in] off0
704 * offset 0
705 *
706 * @param[in] wgt1
707 * weight to be multiplied to source 2
708 *
709 * @param[in] off1
710 * offset 1
711 *
712 * @param[in] shift
713 * (14 - Bit depth) + log2_weight_denominator
714 *
715 * @param[in] lvl_shift1
716 * added before shift and offset
717 *
718 * @param[in] lvl_shift2
719 * added before shift and offset
720 *
721 * @param[in] ht
722 * height of the source
723 *
724 * @param[in] wd
725 * width of the source
726 *
727 * @returns
728 *
729 * @remarks
730 * None
731 *
732 *******************************************************************************
733 */
734
735
736 void ihevc_weighted_pred_bi_ssse3(WORD16 *pi2_src1,
737 WORD16 *pi2_src2,
738 UWORD8 *pu1_dst,
739 WORD32 src_strd1,
740 WORD32 src_strd2,
741 WORD32 dst_strd,
742 WORD32 wgt0,
743 WORD32 off0,
744 WORD32 wgt1,
745 WORD32 off1,
746 WORD32 shift,
747 WORD32 lvl_shift1,
748 WORD32 lvl_shift2,
749 WORD32 ht,
750 WORD32 wd)
751 {
752 WORD32 row, col, temp;
753
754 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
755 __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
756 __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;
757
759 ASSERT(wd % 4 == 0); /* checking assumption*/
760 ASSERT(ht % 4 == 0); /* checking assumption*/
761
762 temp = (off0 + off1 + 1) << (shift - 1);
763
764 // setting values in registers
765 lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
766 wgt0_8x16b = _mm_set1_epi16(wgt0);
767 lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
768 wgt1_8x16b = _mm_set1_epi16(wgt1);
769
770 /* lvl_shift1 * wgt0 */
771 res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
772 res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
773 /* lvl_shift2 * wgt1 */
774 res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
775 res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);
776
777 const_temp_4x32b = _mm_set1_epi32(temp);
778
779 /* lvl_shift1 * wgt0 */
780 lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
781 /* lvl_shift2 * wgt1 */
782 lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);
783
784 if(0 == (wd & 7)) /* wd multiple of 8 case */
785 {
786 __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
787 /* outer for loop starts from here */
788 for(row = 0; row < ht; row += 2)
789 {
790 for(col = 0; col < wd; col += 8)
791 {
792 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
793 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
794 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
795 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
796 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
797
798 /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
799 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
800 res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
801 res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
802 res_temp4_4x32b = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
803 /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
804 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
805 src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
806 src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
807 src_temp4_8x16b = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);
808
809 /* Get 32 bit Result */
810 res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
811 res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
812 res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
813 res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);
814
815 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
816 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
817 res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
818 res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);
819
820 /* (pi2_src[col] + lvl_shift) * wgt */
821 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
822 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
823 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
824 res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
825 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
826 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
827 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
828 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
829
830 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
831 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
832 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
833 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
834 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
835 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
836 /* (i4_tmp >> shift) */
837 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
838 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
839
840 /* Next 4 Pixels */
841 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
842 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
843 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
844 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
845 res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift);
846 res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift);
847
848 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
849 res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
850
851 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
852 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
853 res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
854
855 /* store eight 8-bit output values per row */
856 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
857 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/
858
859 pi2_src1 += 8; /* Pointer update */
860 pi2_src2 += 8; /* Pointer update */
861 pu1_dst += 8; /* Pointer update */
862
863 } /* inner loop ends here(4-output values in single iteration) */
864
865 pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
866 pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
867 pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */
868
869 } /* outer loop ends */
870 }
871 else /* wd multiple of 4 case */
872 {
873 WORD32 dst0, dst1;
874 /* outer for loop starts from here */
875 for(row = 0; row < ht; row += 2)
876 {
877 for(col = 0; col < wd; col += 4)
878 {
879 /* load 4 pixel values per row from 3:0 pos. relative to cur. pos. */
880 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
881 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
882 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
883 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
884
885 /* 2 rows together */
886 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
887 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
888
889 /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
890 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
891 res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
892 /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
893 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
894 src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
895
896 /* Get 32 bit Result */
897 res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
898 res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
899
900 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
901 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
902
903 /* (pi2_src[col] + lvl_shift) * wgt */
904 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
905 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
906 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
907 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
908
909 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
910 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
911 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
912
913 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
914 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
915 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
916
917 /* (i4_tmp >> shift) */
918 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
919 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
920
921 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
922
923 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
924 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
925
926 dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);
927
928 /* dst row = 1 */
929 res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);
930
931 /* store four 8-bit output values */
932 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
933
934 dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);
935
936 /* row = 1 */
937 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
938
939 pi2_src1 += 4; /* Pointer update */
940 pi2_src2 += 4; /* Pointer update */
941 pu1_dst += 4; /* Pointer update */
942
943 } /* inner loop ends here(4-output values in single iteration) */
944
945 pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
946 pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
947 pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */
948
949 } /* outer loop ends */
950 }
951
952 }
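/*
 * Scalar sketch of the bi-weighted prediction implemented above (illustrative
 * only, not part of the build):
 *
 *     for(row = 0; row < ht; row++)
 *         for(col = 0; col < wd; col++)
 *         {
 *             WORD32 i4_tmp = (pi2_src1[row * src_strd1 + col] + lvl_shift1) * wgt0
 *                           + (pi2_src2[row * src_strd2 + col] + lvl_shift2) * wgt1;
 *             i4_tmp += (off0 + off1 + 1) << (shift - 1);
 *             pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp >> shift);
 *         }
 */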
953
954 /**
955 *******************************************************************************
956 *
957 * @brief
958 * Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1 and
959 * pi2_src2 and stores the result at the location pointed to by pu1_dst
960 *
961 * @par Description:
962 * dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 +
963 * off1 + 1) << (shift - 1) ) >> shift
964 *
965 * @param[in] pi2_src1
966 * Pointer to source 1
967 *
968 * @param[in] pi2_src2
969 * Pointer to source 2
970 *
971 * @param[out] pu1_dst
972 * Pointer to destination
973 *
974 * @param[in] src_strd1
975 * Source stride 1
976 *
977 * @param[in] src_strd2
978 * Source stride 2
979 *
980 * @param[in] dst_strd
981 * Destination stride
982 *
983 * @param[in] wgt0
984 * weight to be multiplied to source 1
985 *
986 * @param[in] off0
987 * offset 0
988 *
989 * @param[in] wgt1
990 * weight to be multiplied to source 2
991 *
992 * @param[in] off1
993 * offset 1
994 *
995 * @param[in] shift
996 * (14 - Bit depth) + log2_weight_denominator
997 *
998 * @param[in] lvl_shift1
999 * added before shift and offset
1000 *
1001 * @param[in] lvl_shift2
1002 * added before shift and offset
1003 *
1004 * @param[in] ht
1005 * height of the source
1006 *
1007 * @param[in] wd
1008 * width of the source (each colour component)
1009 *
1010 * @returns
1011 *
1012 * @remarks
1013 * None
1014 *
1015 *******************************************************************************
1016 */
1017
1018
1019 void ihevc_weighted_pred_chroma_bi_ssse3(WORD16 *pi2_src1,
1020 WORD16 *pi2_src2,
1021 UWORD8 *pu1_dst,
1022 WORD32 src_strd1,
1023 WORD32 src_strd2,
1024 WORD32 dst_strd,
1025 WORD32 wgt0_cb,
1026 WORD32 wgt0_cr,
1027 WORD32 off0_cb,
1028 WORD32 off0_cr,
1029 WORD32 wgt1_cb,
1030 WORD32 wgt1_cr,
1031 WORD32 off1_cb,
1032 WORD32 off1_cr,
1033 WORD32 shift,
1034 WORD32 lvl_shift1,
1035 WORD32 lvl_shift2,
1036 WORD32 ht,
1037 WORD32 wd)
1038 {
1039 WORD32 row, col, temp1, temp2;
1040 WORD32 wdx2;
1041
1042 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
1043 __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
1044 __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;
1045
1046 ASSERT(wd % 2 == 0); /* checking assumption*/
1047 ASSERT(ht % 2 == 0); /* checking assumption*/
1048
1049 temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
1050 temp2 = (off0_cr + off1_cr + 1) << (shift - 1);
1051
1052 // setting values in registers
1053 lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
1054 wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
1055 lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
1056 wgt1_8x16b = _mm_set_epi16(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);
1057
1058 /* lvl_shift1 * wgt0 */
1059 res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
1060 res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
1061 /* lvl_shift2 * wgt1 */
1062 res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
1063 res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);
1064
1065 const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
1066 wdx2 = wd * 2;
1067
1068 /* lvl_shift1 * wgt0 */
1069 lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
1070 /* lvl_shift2 * wgt1 */
1071 lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);
1072
1073 if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
1074 {
1075 __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
1076 /* outer for loop starts from here */
1077 for(row = 0; row < ht; row += 2)
1078 {
1079 for(col = 0; col < wdx2; col += 8)
1080 {
1081 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1082 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
1083 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
1084 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
1085 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
1086
1087 /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
1088 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
1089 res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
1090 res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
1091 res_temp4_4x32b = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
1092 /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
1093 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
1094 src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
1095 src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
1096 src_temp4_8x16b = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);
1097
1098 /* Get 32 bit Result */
1099 res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
1100 res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
1101 res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
1102 res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);
1103
1104 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
1105 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
1106 res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
1107 res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);
1108
1109 /* (pi2_src[col] + lvl_shift) * wgt */
1110 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
1111 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
1112 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
1113 res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
1114 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
1115 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
1116 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
1117 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
1118
1119 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
1120 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
1121 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
1122 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
1123 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
1124 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
1125 /* (i4_tmp >> shift) */
1126 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
1127 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
1128
1129 /* Next 4 Pixels */
1130 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
1131 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
1132 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
1133 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
1134 res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift);
1135 res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift);
1136
1137 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
1138 res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
1139
1140 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1141 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
1142 res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
1143
1144 /* store eight 8-bit output values per row */
1145 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
1146 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/
1147
1148 pi2_src1 += 8; /* Pointer update */
1149 pi2_src2 += 8; /* Pointer update */
1150 pu1_dst += 8; /* Pointer update */
1151
1152 } /* inner loop ends here(4-output values in single iteration) */
1153
1154 pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
1155 pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
1156 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
1157
1158 } /* outer loop ends */
1159 }
1160 else /* wdx2 multiple of 4 case */
1161 {
1162 WORD32 dst0, dst1;
1163 /* outer for loop starts from here */
1164 for(row = 0; row < ht; row += 2)
1165 {
1166 for(col = 0; col < wdx2; col += 4)
1167 {
1168 /* load 4 pixel values per row from 3:0 pos. relative to cur. pos. */
1169 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
1170 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
1171 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
1172 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
1173
1174 /* 2 rows together */
1175 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
1176 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
1177
1178 /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
1179 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
1180 res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
1181 /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
1182 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
1183 src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
1184
1185 /* Get 32 bit Result */
1186 res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
1187 res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
1188
1189 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
1190 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
1191
1192 /* (pi2_src[col] + lvl_shift) * wgt */
1193 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
1194 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
1195 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
1196 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
1197
1198 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
1199 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
1200 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
1201
1202 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
1203 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
1204 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
1205
1206 /* (i4_tmp >> shift) */
1207 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
1208 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
1209
1210 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
1211
1212 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1213 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
1214
1215 dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);
1216
1217 /* dst row = 1 */
1218 res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);
1219
1220 /* store four 8-bit output values */
1221 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
1222
1223 dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);
1224
1225 /* row = 1 */
1226 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
1227
1228 pi2_src1 += 4; /* Pointer update */
1229 pi2_src2 += 4; /* Pointer update */
1230 pu1_dst += 4; /* Pointer update */
1231
1232 } /* inner loop ends here(4-output values in single iteration) */
1233
1234 pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
1235 pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
1236 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
1237 }
1238 }
1239
1240 }
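/*
 * Scalar sketch of the interleaved-chroma bi-weighting above (illustrative
 * only, not part of the build): even columns take the Cb weights/offsets,
 * odd columns the Cr ones:
 *
 *     for(row = 0; row < ht; row++)
 *         for(col = 0; col < 2 * wd; col++)
 *         {
 *             WORD32 wgt0 = (col & 1) ? wgt0_cr : wgt0_cb;
 *             WORD32 wgt1 = (col & 1) ? wgt1_cr : wgt1_cb;
 *             WORD32 rnd  = (col & 1) ? ((off0_cr + off1_cr + 1) << (shift - 1))
 *                                     : ((off0_cb + off1_cb + 1) << (shift - 1));
 *             WORD32 i4_tmp = (pi2_src1[row * src_strd1 + col] + lvl_shift1) * wgt0
 *                           + (pi2_src2[row * src_strd2 + col] + lvl_shift2) * wgt1;
 *             pu1_dst[row * dst_strd + col] = CLIP_U8((i4_tmp + rnd) >> shift);
 *         }
 */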
1241
1242 /**
1243 *******************************************************************************
1244 *
1245 * @brief
1246 * Does default bi-weighted prediction on the arrays pointed to by pi2_src1 and
1247 * pi2_src2 and stores the result at the location pointed to by pu1_dst
1248 *
1249 * @par Description:
1250 * dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + 1 << (shift - 1) )
1251 * >> shift where shift = 15 - BitDepth
1252 *
1253 * @param[in] pi2_src1
1254 * Pointer to source 1
1255 *
1256 * @param[in] pi2_src2
1257 * Pointer to source 2
1258 *
1259 * @param[out] pu1_dst
1260 * Pointer to destination
1261 *
1262 * @param[in] src_strd1
1263 * Source stride 1
1264 *
1265 * @param[in] src_strd2
1266 * Source stride 2
1267 *
1268 * @param[in] dst_strd
1269 * Destination stride
1270 *
1271 * @param[in] lvl_shift1
1272 * added before shift and offset
1273 *
1274 * @param[in] lvl_shift2
1275 * added before shift and offset
1276 *
1277 * @param[in] ht
1278 * height of the source
1279 *
1280 * @param[in] wd
1281 * width of the source
1282 *
1283 * @returns
1284 *
1285 * @remarks
1286 * None
1287 *
1288 * Assumption : ht%4 == 0, wd%4 == 0
1289 * shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case,
1290 * final result will match even if intermediate precision is in 16 bit.
1291 *
1292 *******************************************************************************
1293 */
1294 void ihevc_weighted_pred_bi_default_ssse3(WORD16 *pi2_src1,
1295 WORD16 *pi2_src2,
1296 UWORD8 *pu1_dst,
1297 WORD32 src_strd1,
1298 WORD32 src_strd2,
1299 WORD32 dst_strd,
1300 WORD32 lvl_shift1,
1301 WORD32 lvl_shift2,
1302 WORD32 ht,
1303 WORD32 wd)
1304 {
1305 {
1306 WORD32 row, col, temp;
1307 WORD32 shift;
1308
1309 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
1310 __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
1311 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
1312
1313 ASSERT(wd % 4 == 0); /* checking assumption*/
1314 ASSERT(ht % 2 == 0); /* checking assumption*/
1315
1316 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
1317 temp = 1 << (shift - 1);
1318
1319 // setting values in registers
1320 lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
1321 lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
1322 const_temp_8x16b = _mm_set1_epi16(temp);
1323
1324 lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
1325 lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);
1326
1327 if(0 == (ht & 3)) /* ht multiple of 4*/
1328 {
1329 if(0 == (wd & 15)) /* wd multiple of 16 case */
1330 {
1331 __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
1332 __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
1333 /* outer for loop starts from here */
1334 for(row = 0; row < ht; row += 4)
1335 {
1336 for(col = 0; col < wd; col += 16)
1337 {
1338 /*load 8 pixel values */ /* First 8 Values */
1339 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1340 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1341 /* row = 1 */
1342 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1343 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1344 /* row = 2 */
1345 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
1346 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
1347 /* row = 3 */
1348 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
1349 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
1350
1351 /*load 8 pixel values */ /* Second 8 Values */
1352 src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
1353 src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
1354 /* row = 1 */
1355 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
1356 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
1357 /* row = 2 */
1358 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
1359 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
1360
1361 /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
1362 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1363 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
1364 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1365 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
1366
1367 /*load 8 pixel values */ /* Second 8 Values */
1368 /* row = 3 */
1369 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
1370 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
1371
1372 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
1373 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1374 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1375 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1376 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
1377
1378 /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
1379 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b);
1380 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
1381 src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
1382 src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
1383
1384 /* (i4_tmp >> shift) */ /* First 8 Values */
1385 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
1386 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
1387 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
1388 src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);
1389
1390 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
1391 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
1392 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
1393 src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
1394 src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
1395
1396 /* (i4_tmp >> shift) */ /* Second 8 Values */
1397 src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift);
1398 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift);
1399 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift);
1400 src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift);
1401
1402                     /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* All 16 Values */
1403 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
1404 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
1405 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
1406 src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);
1407
1408                     /* store sixteen 8-bit output values per row */ /* All 16 Values */
1409                     _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
1410                     _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
1411                     _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
1412                     _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
1413
1414 /* To update pointer */
1415 pi2_src1 += 16;
1416 pi2_src2 += 16;
1417 pu1_dst += 16;
1418
1419                 } /* inner loop ends here(16-output values in single iteration) */
1420
1421 pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
1422 pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
1423 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
1424
1425 }
1426 }
1427 else if(0 == (wd & 7)) /* multiple of 8 case */
1428 {
1429 /* outer for loop starts from here */
1430 for(row = 0; row < ht; row += 4)
1431 {
1432 for(col = 0; col < wd; col += 8)
1433 {
1434 /*load 8 pixel values */
1435 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1436 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1437 /* row = 1 */
1438 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1439 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1440 /* row = 2 */
1441 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
1442 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
1443 /* row = 3 */
1444 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
1445 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
1446
1447 /* (pi2_src1[col] + pi2_src2[col]) */
1448 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1449 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
1450 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1451 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
1452
1453 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
1454 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1455 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1456 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1457 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
1458
1459 /* (i4_tmp >> shift) */
1460 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
1461 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
1462 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
1463 src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);
1464
1465 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1466 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1467 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
1468 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
1469 src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
1470
1471                     /* store eight 8-bit output values */
1472                     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
1473                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
1474                     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
1475                     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
1476
1477 /* To update pointer */
1478 pi2_src1 += 8;
1479 pi2_src2 += 8;
1480 pu1_dst += 8;
1481
1482 } /* inner loop ends here(8-output values in single iteration) */
1483
1484 pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
1485 pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
1486 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
1487
1488 }
1489 }
1490 else /* wd multiple of 4 case*/
1491 {
1492 WORD32 dst0, dst1, dst2, dst3;
1493
1494 /* outer for loop starts from here */
1495 for(row = 0; row < ht; row += 4)
1496 {
1497 for(col = 0; col < wd; col += 4)
1498 {
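                    /* Only 4 output pixels per row are needed here, so two rows are packed
                       into each 128-bit register (see the unpacklo_epi64 calls below) and
                       all four rows are processed with fewer arithmetic instructions. */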
1499 /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
1500 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
1501                     /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
1502 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
1503
1504 /* row = 1 */
1505 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
1506 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
1507 /* row = 2 */
1508 src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
1509 src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
1510 /* row = 3 */
1511 src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
1512 src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
1513
1514 /* Pack two rows together */
1515 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
1516 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
1517 src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
1518 src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
1519
1520 /* (pi2_src1[col] + pi2_src2[col]) */
1521 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1522 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1523
1524 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
1525 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1526 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1527
1528 /* (i4_tmp >> shift) */
1529 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
1530 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
1531
1532 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1533 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1534 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
1535
1536 dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
1537                     /* dst rows 1 and 3 */
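                    /* _mm_shuffle_epi32(x, 1) moves the second 32-bit lane (the four packed
                       pixels of the next row) into lane 0 so that it can be read out with
                       _mm_cvtsi128_si32. */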
1538 src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
1539 src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
1540
1541 /* store four 8-bit output values */
1542 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
1543
1544 dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
1545 dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
1546 dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
1547
1548 /* row = 1 to row = 3 */
1549 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
1550 *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
1551 *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
1552
1553 /* To update pointer */
1554 pi2_src1 += 4;
1555 pi2_src2 += 4;
1556 pu1_dst += 4;
1557
1558 } /* inner loop ends here(4-output values in single iteration) */
1559
1560 pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
1561 pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
1562 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
1563
1564 }
1565 }
1566 }
1567 else /* ht multiple of 2 case and wd multiple of 4 case*/
1568 {
1569
1570 WORD32 dst0, dst1;
1571
1572 /* outer for loop starts from here */
1573 for(row = 0; row < ht; row += 2)
1574 {
1575 for(col = 0; col < wd; col += 4)
1576 {
1577 /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
1578 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
1579                 /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
1580 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
1581
1582 /* row = 1 */
1583 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
1584 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
1585
1586 /* Pack two rows together */
1587 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
1588 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
1589
1590 /* (pi2_src1[col] + pi2_src2[col]) */
1591 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1592
1593 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
1594 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1595
1596 /* (i4_tmp >> shift) */
1597 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
1598
1599 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1600 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1601
1602 dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
1603                 /* dst row = 1 */
1604 src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
1605
1606 /* store four 8-bit output values */
1607 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
1608
1609 dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
1610
1611                 /* row = 1 */
1612 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
1613
1614 /* To update pointer */
1615 pi2_src1 += 4;
1616 pi2_src2 += 4;
1617 pu1_dst += 4;
1618
1619 } /* inner loop ends here(4-output values in single iteration) */
1620
1621 pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
1622 pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
1623 pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */
1624
1625 }
1626
1627 }
1628
1629 }
1630 }
1631
1632
1633 /**
1634 *******************************************************************************
1635 *
1636 * @brief
1637 * Does chroma default bi-weighted prediction on arrays pointed by pi2_src1 and
1638 * pi2_src2 and stores it at the location pointed by pu1_dst
1639 *
1640 * @par Description:
1641 * dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
1642 * >> shift, where shift = 15 - BitDepth
1643 *
1644 * @param[in] pi2_src1
1645 * Pointer to source 1
1646 *
1647 * @param[in] pi2_src2
1648 * Pointer to source 2
1649 *
1650 * @param[out] pu1_dst
1651 * Pointer to destination
1652 *
1653 * @param[in] src_strd1
1654 * Source stride 1
1655 *
1656 * @param[in] src_strd2
1657 * Source stride 2
1658 *
1659 * @param[in] dst_strd
1660 * Destination stride
1661 *
1662 * @param[in] lvl_shift1
1663 * added before shift and offset
1664 *
1665 * @param[in] lvl_shift2
1666 * added before shift and offset
1667 *
1668 * @param[in] ht
1669 * height of the source
1670 *
1671 * @param[in] wd
1672 * width of the source (each colour component)
1673 *
1674 * @returns
1675 *
1676 * @remarks
1677 * None
1678 *
1679 * Assumption : ht%2 == 0, wd%2 == 0, lvl_shift1==0, lvl_shift2==0.
1680 * shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case,
1681 * the final result will match even if the intermediate precision is 16 bits.
1682 *******************************************************************************
1683 */
1684
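/*
 * For reference, a scalar sketch of the operation that the SIMD code below
 * implements (an illustrative sketch only, not part of the library; CLIP_U8,
 * SHIFT_14_MINUS_BIT_DEPTH and the parameter names follow the conventions used
 * elsewhere in this file; lvl_shift1 and lvl_shift2 are taken as zero per the
 * assumption above, and Cb/Cr samples are interleaved so each row carries
 * 2 * wd values):
 *
 *     shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
 *     for(row = 0; row < ht; row++)
 *     {
 *         for(col = 0; col < 2 * wd; col++)
 *         {
 *             WORD32 i4_tmp = pi2_src1[row * src_strd1 + col]
 *                           + pi2_src2[row * src_strd2 + col]
 *                           + (1 << (shift - 1));
 *             pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp >> shift);
 *         }
 *     }
 */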
1685 void ihevc_weighted_pred_chroma_bi_default_ssse3(WORD16 *pi2_src1,
1686 WORD16 *pi2_src2,
1687 UWORD8 *pu1_dst,
1688 WORD32 src_strd1,
1689 WORD32 src_strd2,
1690 WORD32 dst_strd,
1691 WORD32 lvl_shift1,
1692 WORD32 lvl_shift2,
1693 WORD32 ht,
1694 WORD32 wd)
1695 {
1696 WORD32 row, col, temp;
1697 WORD32 shift, wdx2;
1698
1699 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
1700 __m128i lvl_shift1_8x16b;
1701 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
1702
1703 ASSERT(wd % 2 == 0); /* checking assumption*/
1704 ASSERT(ht % 2 == 0); /* checking assumption*/
1705 UNUSED(lvl_shift1);
1706 UNUSED(lvl_shift2);
1707 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
1708 temp = 1 << (shift - 1);
1709 wdx2 = wd * 2;
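    /* Cb and Cr samples are interleaved, so each row holds 2 * wd 16-bit inputs */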
1710
1711     // setting values in register
1712 lvl_shift1_8x16b = _mm_set1_epi16(temp);
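    /* lvl_shift1 and lvl_shift2 are zero by assumption, so only the rounding
       term (1 << (shift - 1)) needs to be broadcast and added to every sample */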
1713
1714 if(0 == (ht & 3)) /* ht multiple of 4 case */
1715 {
1716 if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
1717 {
1718 __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
1719 __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
1720 /* outer for loop starts from here */
1721 for(row = 0; row < ht; row += 4)
1722 {
1723 for(col = 0; col < wdx2; col += 16)
1724 {
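                    /* 16 interleaved chroma outputs are produced per iteration; unlike the
                       luma path, each 8-sample half is packed separately and written with
                       two 64-bit stores per row. */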
1725 /*load 8 pixel values */ /* First 8 Values */
1726 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1727 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1728 /* row = 1 */
1729 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1730 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1731 /* row = 2 */
1732 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
1733 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
1734 /* row = 3 */
1735 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
1736 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
1737
1738 /*load 8 pixel values */ /* Second 8 Values */
1739 src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
1740 src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
1741 /* row = 1 */
1742 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
1743 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
1744 /* row = 2 */
1745 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
1746 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
1747
1748 /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
1749 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1750 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
1751 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1752 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
1753
1754 /*load 8 pixel values */ /* Second 8 Values */
1755 /* row = 3 */
1756 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
1757 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
1758
1759 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
1760 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1761 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1762 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1763 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
1764
1765 /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
1766 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b);
1767 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
1768 src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
1769 src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
1770
1771 /* (i4_tmp >> shift) */ /* First 8 Values */
1772 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
1773 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
1774 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
1775 src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);
1776
1777 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
1778 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
1779 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
1780 src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
1781 src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
1782
1783 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
1784 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1785 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
1786 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
1787 src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
1788
1789 /* (i4_tmp >> shift) */ /* Second 8 Values */
1790 src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift);
1791 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift);
1792 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift);
1793 src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift);
1794
1795                     /* store eight 8-bit output values */ /* First 8 Values */
1796                     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
1797                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
1798                     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
1799                     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
1800
1801 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
1802 src_temp9_8x16b = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
1803 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);
1804 src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, src_temp13_8x16b);
1805 src_temp15_8x16b = _mm_packus_epi16(src_temp15_8x16b, src_temp15_8x16b);
1806
1807                     /* store eight 8-bit output values */ /* Second 8 Values */
1808                     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
1809                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 1*/
1810                     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd + 8), src_temp13_8x16b); /* row = 2*/
1811                     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd + 8), src_temp15_8x16b); /* row = 3*/
1812
1813 /* To update pointer */
1814 pi2_src1 += 16;
1815 pi2_src2 += 16;
1816 pu1_dst += 16;
1817
1818                 } /* inner loop ends here(16-output values in single iteration) */
1819
1820 pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */
1821 pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */
1822 pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
1823
1824 }
1825 }
1826 else if(0 == (wdx2 & 7)) /* multiple of 8 case */
1827 {
1828 /* outer for loop starts from here */
1829 for(row = 0; row < ht; row += 4)
1830 {
1831 for(col = 0; col < wdx2; col += 8)
1832 {
1833 /*load 8 pixel values */
1834 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1835 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1836 /* row = 1 */
1837 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1838 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1839 /* row = 2 */
1840 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
1841 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
1842 /* row = 3 */
1843 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
1844 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
1845
1846 /* (pi2_src1[col] + pi2_src2[col]) */
1847 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1848 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
1849 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1850 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
1851
1852 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
1853 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1854 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1855 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1856 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
1857
1858 /* (i4_tmp >> shift) */
1859 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
1860 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
1861 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
1862 src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);
1863
1864 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1865 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1866 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
1867 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
1868 src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
1869
1870                     /* store eight 8-bit output values */
1871                     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
1872                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
1873                     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
1874                     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
1875
1876 /* To update pointer */
1877 pi2_src1 += 8;
1878 pi2_src2 += 8;
1879 pu1_dst += 8;
1880
1881 } /* inner loop ends here(8-output values in single iteration) */
1882
1883 pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */
1884 pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */
1885 pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
1886
1887 }
1888 }
1889 else /* 2*wd multiple of 4 case */
1890 {
1891 WORD32 dst0, dst1, dst2, dst3;
1892 /* outer for loop starts from here */
1893 for(row = 0; row < ht; row += 4)
1894 {
1895 for(col = 0; col < wdx2; col += 4)
1896 {
1897 /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
1898 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
1899                     /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
1900 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
1901
1902 /* row = 1 */
1903 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
1904 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
1905 /* row = 2 */
1906 src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
1907 src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
1908 /* row = 3 */
1909 src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
1910 src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
1911
1912 /* Pack two rows together */
1913 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
1914 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
1915 src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
1916 src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
1917
1918 /* (pi2_src1[col] + pi2_src2[col]) */
1919 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1920 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1921
1922 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
1923 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1924 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1925
1926 /* (i4_tmp >> shift) */
1927 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
1928 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
1929
1930 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1931 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1932 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
1933
1934 dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
1935                     /* dst rows 1 and 3 */
1936 src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
1937 src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
1938
1939 /* store four 8-bit output values */
1940 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
1941
1942 dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
1943 dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
1944 dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
1945
1946 /* row = 1 to row = 3 */
1947 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
1948 *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
1949 *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
1950
1951 /* To update pointer */
1952 pi2_src1 += 4;
1953 pi2_src2 += 4;
1954 pu1_dst += 4;
1955
1956 } /* inner loop ends here(4-output values in single iteration) */
1957
1958 pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */
1959 pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */
1960 pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
1961
1962 }
1963 }
1964 }
1965 else /* ht multiple of 2 case */
1966 {
1967 if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
1968 {
1969 __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
1970 /* outer for loop starts from here */
1971 for(row = 0; row < ht; row += 2)
1972 {
1973 for(col = 0; col < wdx2; col += 16)
1974 {
1975 /*load 8 pixel values */ /* First 8 Values */
1976 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1977 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1978 /* row = 1 */
1979 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1980 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1981
1982 /*load 8 pixel values */ /* Second 8 Values */
1983 src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
1984 src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
1985 /* row = 1 */
1986 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
1987 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
1988
1989 /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
1990 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1991 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
1992
1993 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
1994 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1995 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1996
1997 /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
1998 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b);
1999 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
2000
2001 /* (i4_tmp >> shift) */ /* First 8 Values */
2002 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
2003 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
2004
2005 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
2006 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
2007 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
2008
2009 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
2010 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
2011 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
2012
2013 /* (i4_tmp >> shift) */ /* Second 8 Values */
2014 src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift);
2015 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift);
2016
2017                     /* store eight 8-bit output values */ /* First 8 Values */
2018                     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
2019                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
2020
2021 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
2022 src_temp9_8x16b = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
2023 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);
2024
2025                     /* store eight 8-bit output values */ /* Second 8 Values */
2026                     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
2027                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 1*/
2028
2029 /* To update pointer */
2030 pi2_src1 += 16;
2031 pi2_src2 += 16;
2032 pu1_dst += 16;
2033
2034                 } /* inner loop ends here(16-output values in single iteration) */
2035
2036 pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
2037 pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
2038 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
2039
2040 }
2041 }
2042 else if(0 == (wdx2 & 7)) /* multiple of 8 case */
2043 {
2044 /* outer for loop starts from here */
2045 for(row = 0; row < ht; row += 2)
2046 {
2047 for(col = 0; col < wdx2; col += 8)
2048 {
2049 /*load 8 pixel values */
2050 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
2051 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
2052 /* row = 1 */
2053 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
2054 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
2055
2056 /* (pi2_src1[col] + pi2_src2[col]) */
2057 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
2058 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
2059
2060 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
2061 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
2062 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
2063
2064 /* (i4_tmp >> shift) */
2065 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
2066 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
2067
2068 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
2069 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
2070 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
2071
2072                     /* store eight 8-bit output values */
2073 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
2074 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
2075
2076 /* To update pointer */
2077 pi2_src1 += 8;
2078 pi2_src2 += 8;
2079 pu1_dst += 8;
2080
2081 } /* inner loop ends here(8-output values in single iteration) */
2082
2083 pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
2084 pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
2085 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
2086
2087 }
2088 }
2089 else /* 2*wd multiple of 4 case */
2090 {
2091 WORD32 dst0, dst1;
2092 /* outer for loop starts from here */
2093 for(row = 0; row < ht; row += 2)
2094 {
2095 for(col = 0; col < wdx2; col += 4)
2096 {
2097 /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
2098 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
2099                     /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
2100 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
2101 /* row = 1 */
2102 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
2103 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
2104
2105 /* Pack two rows together */
2106 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
2107 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
2108
2109 /* (pi2_src1[col] + pi2_src2[col]) */
2110 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
2111 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
2112 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
2113
2114 /* (i4_tmp >> shift) */
2115 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
2116 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
2117 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
2118
2119 dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
2120 /* dst row = 1 */
2121 src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
2122
2123 /* store four 8-bit output values */
2124 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
2125
2126 dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
2127 /* row = 1 */
2128 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
2129
2130 /* To update pointer */
2131 pi2_src1 += 4;
2132 pi2_src2 += 4;
2133 pu1_dst += 4;
2134 } /* inner loop ends here(4-output values in single iteration) */
2135
2136 pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
2137 pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
2138 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
2139
2140 }
2141 }
2142 }
2143 }
2144