/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/*****************************************************************************/
/*                                                                           */
/*  File Name         : ih264_weighted_pred_intr_sse42.c                     */
/*                                                                           */
/*  Description       : Contains function definitions for weighted           */
/*                      prediction functions in x86 sse4 intrinsics          */
/*                                                                           */
/*  List of Functions : ih264_default_weighted_pred_luma_sse42()             */
/*                      ih264_default_weighted_pred_chroma_sse42()           */
/*                      ih264_weighted_pred_luma_sse42()                     */
/*                      ih264_weighted_pred_chroma_sse42()                   */
/*                      ih264_weighted_bi_pred_luma_sse42()                  */
/*                      ih264_weighted_bi_pred_chroma_sse42()                */
/*                                                                           */
/*  Issues / Problems : None                                                 */
/*                                                                           */
/*  Revision History  :                                                      */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         30 01 2015   Kaushik         Initial version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
/*****************************************************************************/
/*  File Includes                                                            */
/*****************************************************************************/

#include <immintrin.h>
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_weighted_pred.h"

/*****************************************************************************/
/*  Function definitions                                                     */
/*****************************************************************************/
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_default_weighted_pred_luma_sse42                   */
/*                                                                           */
/*  Description   : This function performs the default weighted prediction   */
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
/*                  sample prediction process" for luma. The function gets   */
/*                  two ht x wd blocks, calculates their rounded-average and */
/*                  stores it in the destination block. (ht,wd) can be       */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                                                                           */
/*  Inputs        : pu1_src1 - Pointer to source 1                           */
/*                  pu1_src2 - Pointer to source 2                           */
/*                  pu1_dst - Pointer to destination                         */
/*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd - stride for destination                        */
/*                  ht - height of the block                                 */
/*                  wd - width of the block                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1,
                                            UWORD8 *pu1_src2,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 ht,
                                            WORD32 wd)
{
    __m128i y0_0_16x8b, y0_1_16x8b, y0_2_16x8b, y0_3_16x8b;
    __m128i y1_0_16x8b, y1_1_16x8b, y1_2_16x8b, y1_3_16x8b;

    if(wd == 4)
    {
        do
        {
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y0_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y0_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y0_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y0_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y0_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        do
        {
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y0_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);

            _mm_storel_epi64((__m128i *)pu1_dst, y0_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y0_4_16x8b, y0_5_16x8b, y0_6_16x8b, y0_7_16x8b;
        __m128i y1_4_16x8b, y1_5_16x8b, y1_6_16x8b, y1_7_16x8b;

        do
        {
            y0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y0_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y0_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 3));
            y0_4_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + (src_strd1 << 2)));
            y0_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 5));
            y0_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 6));
            y0_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 7));

            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
            y1_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y1_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 3));
            y1_4_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + (src_strd2 << 2)));
            y1_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 5));
            y1_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 6));
            y1_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 7));

            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
            y0_4_16x8b = _mm_avg_epu8(y0_4_16x8b, y1_4_16x8b);
            y0_5_16x8b = _mm_avg_epu8(y0_5_16x8b, y1_5_16x8b);
            y0_6_16x8b = _mm_avg_epu8(y0_6_16x8b, y1_6_16x8b);
            y0_7_16x8b = _mm_avg_epu8(y0_7_16x8b, y1_7_16x8b);

            _mm_storeu_si128((__m128i *)pu1_dst, y0_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 2)), y0_4_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 5), y0_5_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 6), y0_6_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 7), y0_7_16x8b);

            ht -= 8;
            pu1_src1 += src_strd1 << 3;
            pu1_src2 += src_strd2 << 3;
            pu1_dst += dst_strd << 3;
        }
        while(ht > 0);
    }
}

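/*****************************************************************************/
/*  Reference model (illustrative only)                                      */
/*                                                                           */
/*  A minimal scalar sketch of the computation above, assuming the same      */
/*  argument conventions; the helper name is hypothetical and not part of    */
/*  the codec interface. Per byte this computes the same rounded average     */
/*  as _mm_avg_epu8, i.e. (src1 + src2 + 1) >> 1. The chroma variant below   */
/*  performs the identical operation on interleaved U/V samples.             */
/*****************************************************************************/
static void default_weighted_pred_scalar(UWORD8 *pu1_src1,
                                         UWORD8 *pu1_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1,
                                         WORD32 src_strd2,
                                         WORD32 dst_strd,
                                         WORD32 ht,
                                         WORD32 wd)
{
    WORD32 i, j;

    for(i = 0; i < ht; i++)
    {
        /* Rounded average per sample, matching _mm_avg_epu8 */
        for(j = 0; j < wd; j++)
            pu1_dst[j] = (UWORD8)((pu1_src1[j] + pu1_src2[j] + 1) >> 1);

        pu1_src1 += src_strd1;
        pu1_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}
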
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_default_weighted_pred_chroma_sse42                 */
/*                                                                           */
/*  Description   : This function performs the default weighted prediction   */
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
/*                  sample prediction process" for chroma. The function gets */
/*                  two ht x wd blocks, calculates their rounded-average and */
/*                  stores it in the destination block. (ht,wd) can be       */
/*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
/*                                                                           */
/*  Inputs        : pu1_src1 - Pointer to source 1                           */
/*                  pu1_src2 - Pointer to source 2                           */
/*                  pu1_dst - Pointer to destination                         */
/*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd - stride for destination                        */
/*                  ht - height of the block                                 */
/*                  wd - width of the block                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
                                              UWORD8 *pu1_src2,
                                              UWORD8 *pu1_dst,
                                              WORD32 src_strd1,
                                              WORD32 src_strd2,
                                              WORD32 dst_strd,
                                              WORD32 ht,
                                              WORD32 wd)
{
    __m128i uv0_0_16x8b, uv0_1_16x8b;
    __m128i uv1_0_16x8b, uv1_1_16x8b;

    if(wd == 2)
    {
        do
        {
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(uv0_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(uv0_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        do
        {
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);

            _mm_storel_epi64((__m128i *)pu1_dst, uv0_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        __m128i uv0_2_16x8b, uv0_3_16x8b;
        __m128i uv1_2_16x8b, uv1_3_16x8b;

        do
        {
            uv0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            uv0_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            uv0_3_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + src_strd1 * 3));

            uv1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
            uv1_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            uv1_3_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + src_strd2 * 3));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
            uv0_2_16x8b = _mm_avg_epu8(uv0_2_16x8b, uv1_2_16x8b);
            uv0_3_16x8b = _mm_avg_epu8(uv0_3_16x8b, uv1_3_16x8b);

            _mm_storeu_si128((__m128i *)pu1_dst, uv0_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
            _mm_storeu_si128(
                            (__m128i *)(pu1_dst + (dst_strd << 1)), uv0_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), uv0_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_pred_luma_sse42                           */
/*                                                                           */
/*  Description   : This function performs the weighted prediction as        */
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
/*                  prediction process" for luma. The function gets one      */
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
/*                  saturates it to unsigned 8-bit and stores it in the      */
/*                  destination block. (ht,wd) can be (4,4), (8,4), (4,8),   */
/*                  (8,8), (16,8), (8,16) or (16,16).                        */
/*                                                                           */
/*  Inputs        : pu1_src - Pointer to source                              */
/*                  pu1_dst - Pointer to destination                         */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  log_wd - number of bits to be rounded off                */
/*                  wt - weight value                                        */
/*                  ofst - offset value                                      */
/*                  ht - height of the block                                 */
/*                  wd - width of the block                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src,
                                    UWORD8 *pu1_dst,
                                    WORD32 src_strd,
                                    WORD32 dst_strd,
                                    WORD32 log_wd,
                                    WORD32 wt,
                                    WORD32 ofst,
                                    WORD32 ht,
                                    WORD32 wd)
{
    __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;

    __m128i wt_8x16b, round_8x16b, ofst_8x16b;

    WORD32 round_val;

    wt = (WORD16)(wt & 0xffff);
    round_val = 1 << (log_wd - 1);
    ofst = (WORD8)(ofst & 0xff);

    wt_8x16b = _mm_set1_epi16(wt);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi16(ofst);

    if(wd == 4)
    {
        __m128i y_0_8x16b, y_2_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));

            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
            y_2_16x8b = _mm_unpacklo_epi32(y_2_16x8b, y_3_16x8b);

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_2_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
            y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8);
            y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i y_0_8x16b, y_1_8x16b, y_2_8x16b, y_3_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_3_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
            y_3_8x16b = _mm_mullo_epi16(y_3_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
            y_3_8x16b = _mm_adds_epi16(round_8x16b, y_3_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
            y_3_8x16b = _mm_srai_epi16(y_3_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
            y_3_8x16b = _mm_adds_epi16(ofst_8x16b, y_3_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2_8x16b, y_3_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
            y_3_16x8b = _mm_srli_si128(y_2_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));

            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);

            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);

            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);

            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);

            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}

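/*****************************************************************************/
/*  Reference model (illustrative only)                                      */
/*                                                                           */
/*  A minimal scalar sketch of the weighting above, assuming log_wd >= 1     */
/*  as the SIMD path does, and an arithmetic right shift of negative         */
/*  values (matching _mm_srai_epi16); the helper name is hypothetical.       */
/*  The saturating 16-bit adds above agree with this model for the wt,       */
/*  ofst and log_wd ranges permitted by the H.264 syntax.                    */
/*****************************************************************************/
static void weighted_pred_luma_scalar(UWORD8 *pu1_src, UWORD8 *pu1_dst,
                                      WORD32 src_strd, WORD32 dst_strd,
                                      WORD32 log_wd, WORD32 wt, WORD32 ofst,
                                      WORD32 ht, WORD32 wd)
{
    WORD32 i, j, val;
    WORD32 round_val = 1 << (log_wd - 1);

    wt = (WORD16)(wt & 0xffff);
    ofst = (WORD8)(ofst & 0xff);

    for(i = 0; i < ht; i++)
    {
        for(j = 0; j < wd; j++)
        {
            /* weight, round, shift, offset, then clip to [0, 255] */
            val = ((pu1_src[j] * wt + round_val) >> log_wd) + ofst;
            pu1_dst[j] = (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
        }
        pu1_src += src_strd;
        pu1_dst += dst_strd;
    }
}
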
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_pred_chroma_sse42                         */
/*                                                                           */
/*  Description   : This function performs the weighted prediction as        */
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
/*                  prediction process" for chroma. The function gets one    */
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
/*                  saturates it to unsigned 8-bit and stores it in the      */
/*                  destination block. (ht,wd) can be (2,2), (4,2), (2,4),   */
/*                  (4,4), (8,4), (4,8) or (8,8).                            */
/*                                                                           */
/*  Inputs        : pu1_src - Pointer to source                              */
/*                  pu1_dst - Pointer to destination                         */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  log_wd - number of bits to be rounded off                */
/*                  wt - weight values for u and v                           */
/*                  ofst - offset values for u and v                         */
/*                  ht - height of the block                                 */
/*                  wd - width of the block                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 log_wd,
                                      WORD32 wt,
                                      WORD32 ofst,
                                      WORD32 ht,
                                      WORD32 wd)
{
    __m128i y_0_16x8b, y_1_16x8b;

    __m128i wt_8x16b, round_8x16b, ofst_8x16b;

    WORD32 ofst_u, ofst_v;
    WORD32 round_val;

    ofst_u = (WORD8)(ofst & 0xff);
    ofst_v = (WORD8)(ofst >> 8);
    round_val = 1 << (log_wd - 1);
    ofst = (ofst_u & 0xffff) | (ofst_v << 16);

    wt_8x16b = _mm_set1_epi32(wt);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst);

    if(wd == 2)
    {
        __m128i y_0_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        __m128i y_0_8x16b, y_1_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        __m128i y_2_16x8b, y_3_16x8b;
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));

            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);

            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);

            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);

            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);

            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}

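/*****************************************************************************/
/*  Reference model (illustrative only)                                      */
/*                                                                           */
/*  A minimal scalar sketch of the chroma weighting above; the helper name   */
/*  is hypothetical. Chroma samples are interleaved as UVUV..., so wd        */
/*  counts UV pairs. wt carries the U weight in its low 16 bits and the V    */
/*  weight in its high 16 bits; ofst carries the U offset in its low byte    */
/*  and the V offset in the next byte, mirroring the _mm_set1_epi32()        */
/*  splats above.                                                            */
/*****************************************************************************/
static void weighted_pred_chroma_scalar(UWORD8 *pu1_src, UWORD8 *pu1_dst,
                                        WORD32 src_strd, WORD32 dst_strd,
                                        WORD32 log_wd, WORD32 wt, WORD32 ofst,
                                        WORD32 ht, WORD32 wd)
{
    WORD32 i, j, val;
    WORD32 round_val = 1 << (log_wd - 1);
    WORD32 wt_u = (WORD16)(wt & 0xffff);    /* U weight (low 16 bits)  */
    WORD32 wt_v = (WORD16)(wt >> 16);       /* V weight (high 16 bits) */
    WORD32 ofst_u = (WORD8)(ofst & 0xff);   /* U offset (low byte)     */
    WORD32 ofst_v = (WORD8)(ofst >> 8);     /* V offset (next byte)    */

    for(i = 0; i < ht; i++)
    {
        for(j = 0; j < 2 * wd; j += 2)
        {
            val = ((pu1_src[j] * wt_u + round_val) >> log_wd) + ofst_u;
            pu1_dst[j] = (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));

            val = ((pu1_src[j + 1] * wt_v + round_val) >> log_wd) + ofst_v;
            pu1_dst[j + 1] = (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
        }
        pu1_src += src_strd;
        pu1_dst += dst_strd;
    }
}
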
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_bi_pred_luma_sse42                        */
/*                                                                           */
/*  Description   : This function performs the weighted biprediction as      */
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
/*                  prediction process" for luma. The function gets two      */
/*                  ht x wd blocks, weights them, adds them, rounds off the  */
/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
/*                  stores it in the destination block. (ht,wd) can be       */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                                                                           */
/*  Inputs        : pu1_src1 - Pointer to source 1                           */
/*                  pu1_src2 - Pointer to source 2                           */
/*                  pu1_dst - Pointer to destination                         */
/*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd - stride for destination                        */
/*                  log_wd - number of bits to be rounded off                */
/*                  wt1 - weight value for source 1                          */
/*                  wt2 - weight value for source 2                          */
/*                  ofst1 - offset value for source 1                        */
/*                  ofst2 - offset value for source 2                        */
/*                  ht - height of the block                                 */
/*                  wd - width of the block                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
                                       UWORD8 *pu1_src2,
                                       UWORD8 *pu1_dst,
                                       WORD32 src_strd1,
                                       WORD32 src_strd2,
                                       WORD32 dst_strd,
                                       WORD32 log_wd,
                                       WORD32 wt1,
                                       WORD32 wt2,
                                       WORD32 ofst1,
                                       WORD32 ofst2,
                                       WORD32 ht,
                                       WORD32 wd)
{
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst;
    WORD32 round_val, shft;

    wt1 = (WORD16)(wt1 & 0xffff);
    wt2 = (WORD16)(wt2 & 0xffff);
    round_val = 1 << log_wd;
    shft = log_wd + 1;
    ofst1 = (WORD8)(ofst1 & 0xff);
    ofst2 = (WORD8)(ofst2 & 0xff);
    ofst = (ofst1 + ofst2 + 1) >> 1;

    wt1_8x16b = _mm_set1_epi16(wt1);
    wt2_8x16b = _mm_set1_epi16(wt2);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi16(ofst);

    if(wd == 4)
    {
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_2_8x16b;
        __m128i y2_0_8x16b, y2_2_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y1_2_16x8b = _mm_unpacklo_epi32(y1_2_16x8b, y1_3_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
            y2_2_16x8b = _mm_unpacklo_epi32(y2_2_16x8b, y2_3_16x8b);

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_2_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
            y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_1_8x16b, y1_2_8x16b, y1_3_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b, y2_2_8x16b, y2_3_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y1_3_8x16b = _mm_cvtepu8_epi16(y1_3_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
            y2_3_8x16b = _mm_cvtepu8_epi16(y2_3_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
            y1_3_8x16b = _mm_mullo_epi16(y1_3_8x16b, wt1_8x16b);
            y2_3_8x16b = _mm_mullo_epi16(y2_3_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(y1_3_8x16b, y2_3_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(round_8x16b, y1_3_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
            y1_3_8x16b = _mm_srai_epi16(y1_3_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(ofst_8x16b, y1_3_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_2_16x8b = _mm_packus_epi16(y1_2_8x16b, y1_3_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_2_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}

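/*****************************************************************************/
/*  Reference model (illustrative only)                                      */
/*                                                                           */
/*  A minimal scalar sketch of the bi-prediction above; the helper name is   */
/*  hypothetical. The two weighted samples are summed, rounded with          */
/*  2^log_wd and shifted by (log_wd + 1), then offset by the rounded mean    */
/*  of the two offsets, as in sec 8.4.2.3.2.                                 */
/*****************************************************************************/
static void weighted_bi_pred_luma_scalar(UWORD8 *pu1_src1, UWORD8 *pu1_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1, WORD32 src_strd2,
                                         WORD32 dst_strd, WORD32 log_wd,
                                         WORD32 wt1, WORD32 wt2,
                                         WORD32 ofst1, WORD32 ofst2,
                                         WORD32 ht, WORD32 wd)
{
    WORD32 i, j, val;
    WORD32 round_val = 1 << log_wd;
    WORD32 ofst;

    wt1 = (WORD16)(wt1 & 0xffff);
    wt2 = (WORD16)(wt2 & 0xffff);
    ofst = ((WORD8)(ofst1 & 0xff) + (WORD8)(ofst2 & 0xff) + 1) >> 1;

    for(i = 0; i < ht; i++)
    {
        for(j = 0; j < wd; j++)
        {
            /* weighted sum, round, shift by (log_wd + 1), offset, clip */
            val = pu1_src1[j] * wt1 + pu1_src2[j] * wt2;
            val = ((val + round_val) >> (log_wd + 1)) + ofst;
            pu1_dst[j] = (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
        }
        pu1_src1 += src_strd1;
        pu1_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}
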
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_bi_pred_chroma_sse42                      */
/*                                                                           */
/*  Description   : This function performs the weighted biprediction as      */
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
/*                  prediction process" for chroma. The function gets two    */
/*                  ht x wd blocks, weights them, adds them, rounds off the  */
/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
/*                  stores it in the destination block. (ht,wd) can be       */
/*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
/*                                                                           */
/*  Inputs        : pu1_src1 - Pointer to source 1                           */
/*                  pu1_src2 - Pointer to source 2                           */
/*                  pu1_dst - Pointer to destination                         */
/*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd - stride for destination                        */
/*                  log_wd - number of bits to be rounded off                */
/*                  wt1 - weight values for u and v in source 1              */
/*                  wt2 - weight values for u and v in source 2              */
/*                  ofst1 - offset value for u and v in source 1             */
/*                  ofst2 - offset value for u and v in source 2             */
/*                  ht - height of the block                                 */
/*                  wd - width of the block                                  */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         04 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
                                         UWORD8 *pu1_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1,
                                         WORD32 src_strd2,
                                         WORD32 dst_strd,
                                         WORD32 log_wd,
                                         WORD32 wt1,
                                         WORD32 wt2,
                                         WORD32 ofst1,
                                         WORD32 ofst2,
                                         WORD32 ht,
                                         WORD32 wd)
{
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst1_u, ofst2_u, ofst_u;
    WORD32 ofst1_v, ofst2_v, ofst_v;
    WORD32 round_val, shft, ofst_val;

    round_val = 1 << log_wd;
    shft = log_wd + 1;

    ofst1_u = (WORD8)(ofst1 & 0xff);
    ofst1_v = (WORD8)(ofst1 >> 8);
    ofst2_u = (WORD8)(ofst2 & 0xff);
    ofst2_v = (WORD8)(ofst2 >> 8);

    wt1_8x16b = _mm_set1_epi32(wt1);
    wt2_8x16b = _mm_set1_epi32(wt2);

    ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
    ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
    ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);

    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst_val);

    if(wd == 2)
    {
        __m128i y1_0_8x16b, y2_0_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        __m128i y1_0_8x16b, y1_1_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}

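/*****************************************************************************/
/*  Reference model (illustrative only)                                      */
/*                                                                           */
/*  A minimal scalar sketch of the chroma bi-prediction above; the helper    */
/*  name is hypothetical. It combines the UVUV interleaving and packed       */
/*  per-component weights/offsets of the chroma paths with the               */
/*  (log_wd + 1) rounding of the bi-prediction paths.                        */
/*****************************************************************************/
static void weighted_bi_pred_chroma_scalar(UWORD8 *pu1_src1, UWORD8 *pu1_src2,
                                           UWORD8 *pu1_dst,
                                           WORD32 src_strd1, WORD32 src_strd2,
                                           WORD32 dst_strd, WORD32 log_wd,
                                           WORD32 wt1, WORD32 wt2,
                                           WORD32 ofst1, WORD32 ofst2,
                                           WORD32 ht, WORD32 wd)
{
    WORD32 i, j, c, val;
    WORD32 round_val = 1 << log_wd;
    WORD32 wt1_c[2], wt2_c[2], ofst_c[2];

    wt1_c[0] = (WORD16)(wt1 & 0xffff);  /* U weight, source 1 */
    wt1_c[1] = (WORD16)(wt1 >> 16);     /* V weight, source 1 */
    wt2_c[0] = (WORD16)(wt2 & 0xffff);  /* U weight, source 2 */
    wt2_c[1] = (WORD16)(wt2 >> 16);     /* V weight, source 2 */

    /* rounded mean of the per-component offsets, as above */
    ofst_c[0] = ((WORD8)(ofst1 & 0xff) + (WORD8)(ofst2 & 0xff) + 1) >> 1;
    ofst_c[1] = ((WORD8)(ofst1 >> 8) + (WORD8)(ofst2 >> 8) + 1) >> 1;

    for(i = 0; i < ht; i++)
    {
        for(j = 0; j < 2 * wd; j++)
        {
            c = j & 1; /* 0 = U, 1 = V */
            val = pu1_src1[j] * wt1_c[c] + pu1_src2[j] * wt2_c[c];
            val = ((val + round_val) >> (log_wd + 1)) + ofst_c[c];
            pu1_dst[j] = (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
        }
        pu1_src1 += src_strd1;
        pu1_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}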