1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevcd_it_rec_dc_x86_intr.c
22 *
23 * @brief
24 * Platform specific intrinsic implementation of certain functions
25 *
26 * @author
27 * Ittiam
28 * @par List of Functions:
* - ihevcd_itrans_recon_dc_luma_sse42
* - ihevcd_itrans_recon_dc_chroma_sse42
31 *
32 * @remarks
33 * None
34 *
35 *******************************************************************************
36 */
37
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevcd_function_selector.h"

#include <string.h>
#include <immintrin.h>
45
46
/**
*******************************************************************************
*
* @brief
*  DC-only inverse transform and reconstruction of a square luma block
*
* @par Description:
*  Derives one DC residual from i2_coeff_value by running it through the two
*  scaling/rounding stages (multiply by 64, add the rounding offset, shift by
*  IT_SHIFT_STAGE_1 and then IT_SHIFT_STAGE_2, clipping to 16 bits after each
*  stage). The residual is added to every prediction sample of the block, the
*  sum is saturated to the unsigned 8-bit range and written to the
*  destination.
*
* @param[in] pu1_pred
*  Pointer to the first prediction sample of the block (one byte per sample)
*
* @param[out] pu1_dst
*  Pointer to the first destination sample of the block
*
* @param[in] pred_strd
*  Distance in bytes between two consecutive prediction rows
*
* @param[in] dst_strd
*  Distance in bytes between two consecutive destination rows
*
* @param[in] log2_trans_size
*  Log2 of the block width/height; the block is (1 << log2_trans_size) square
*
* @param[in] i2_coeff_value
*  The single (DC) coefficient of the block
*
* @returns  None
*
* @remarks
*  - A 4x4 block is handled as one unit and touches exactly 4 bytes per row of
*    both the prediction and the destination.
*  - Every other size is processed in tiles of 4 rows x 8 columns, so a size
*    smaller than 8 on this path would touch samples outside the block.
*    Presumably callers only pass log2_trans_size >= 2 - TODO confirm.
*  - Within a tile all prediction rows are read before any destination row is
*    written.
*
*******************************************************************************
*/
void ihevcd_itrans_recon_dc_luma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
                                       WORD32 log2_trans_size, WORD16 i2_coeff_value)
{
    __m128i dc_8x16b, zero_8x16b;
    WORD32 add, shift;
    WORD32 dc_value, quant_out;
    WORD32 trans_size;
    WORD32 row, col;

    trans_size = (1 << log2_trans_size);

    quant_out = i2_coeff_value;

    /* First scaling stage of the DC coefficient */
    shift = IT_SHIFT_STAGE_1;
    add = 1 << (shift - 1);
    dc_value = CLIP_S16((quant_out * 64 + add) >> shift);

    /* Second scaling stage, applied to the result of the first */
    shift = IT_SHIFT_STAGE_2;
    add = 1 << (shift - 1);
    dc_value = CLIP_S16((dc_value * 64 + add) >> shift);

    /* Replicate the DC residual into all eight 16-bit lanes */
    dc_8x16b = _mm_set1_epi16(dc_value);
    zero_8x16b = _mm_setzero_si128();

    if(trans_size == 4)
    {
        WORD32 ai4_row[4];
        __m128i rows_01_8x16b, rows_23_8x16b, recon_16x8b;

        /* Fetch one WORD32 (the four samples of a row) per row. memcpy keeps */
        /* the access within the block and free of alignment/aliasing         */
        /* assumptions on the byte buffers.                                   */
        for(row = 0; row < 4; row++)
        {
            memcpy(&ai4_row[row], pu1_pred + row * pred_strd, sizeof(ai4_row[row]));
        }

        /* Place rows 0,1 and rows 2,3 side by side in the low 8 bytes */
        rows_01_8x16b = _mm_unpacklo_epi32(_mm_cvtsi32_si128(ai4_row[0]),
                                           _mm_cvtsi32_si128(ai4_row[1]));
        rows_23_8x16b = _mm_unpacklo_epi32(_mm_cvtsi32_si128(ai4_row[2]),
                                           _mm_cvtsi32_si128(ai4_row[3]));

        /* Widen the samples to 16 bits and add the residual */
        rows_01_8x16b = _mm_add_epi16(_mm_unpacklo_epi8(rows_01_8x16b, zero_8x16b), dc_8x16b);
        rows_23_8x16b = _mm_add_epi16(_mm_unpacklo_epi8(rows_23_8x16b, zero_8x16b), dc_8x16b);

        /* Saturate back to 8 bits: result holds row0 | row1 | row2 | row3 */
        recon_16x8b = _mm_packus_epi16(rows_01_8x16b, rows_23_8x16b);

        ai4_row[0] = _mm_cvtsi128_si32(recon_16x8b);
        ai4_row[1] = _mm_cvtsi128_si32(_mm_srli_si128(recon_16x8b, 4));
        ai4_row[2] = _mm_cvtsi128_si32(_mm_srli_si128(recon_16x8b, 8));
        ai4_row[3] = _mm_cvtsi128_si32(_mm_srli_si128(recon_16x8b, 12));

        for(row = 0; row < 4; row++)
        {
            memcpy(pu1_dst + row * dst_strd, &ai4_row[row], sizeof(ai4_row[row]));
        }
    }
    else
    {
        for(row = 0; row < trans_size; row += 4)
        {
            for(col = 0; col < trans_size; col += 8)
            {
                /* Top-left corner of the current 4-row x 8-column tile */
                UWORD8 *pu1_pred_tile = pu1_pred + row * pred_strd + col;
                UWORD8 *pu1_dst_tile = pu1_dst + row * dst_strd + col;
                __m128i row_0, row_1, row_2, row_3;
                __m128i rows_01_16x8b, rows_23_16x8b;

                /* Load 8 prediction samples from each of the 4 rows */
                row_0 = _mm_loadl_epi64((__m128i *)(pu1_pred_tile));
                row_1 = _mm_loadl_epi64((__m128i *)(pu1_pred_tile + pred_strd));
                row_2 = _mm_loadl_epi64((__m128i *)(pu1_pred_tile + 2 * pred_strd));
                row_3 = _mm_loadl_epi64((__m128i *)(pu1_pred_tile + 3 * pred_strd));

                /* Widen to 16 bits and add the residual */
                row_0 = _mm_add_epi16(_mm_unpacklo_epi8(row_0, zero_8x16b), dc_8x16b);
                row_1 = _mm_add_epi16(_mm_unpacklo_epi8(row_1, zero_8x16b), dc_8x16b);
                row_2 = _mm_add_epi16(_mm_unpacklo_epi8(row_2, zero_8x16b), dc_8x16b);
                row_3 = _mm_add_epi16(_mm_unpacklo_epi8(row_3, zero_8x16b), dc_8x16b);

                /* Saturate to 8 bits; each register holds two rows */
                rows_01_16x8b = _mm_packus_epi16(row_0, row_1);
                rows_23_16x8b = _mm_packus_epi16(row_2, row_3);

                /* Low half is the even row, high half the odd row */
                _mm_storel_epi64((__m128i *)(pu1_dst_tile), rows_01_16x8b);
                _mm_storel_epi64((__m128i *)(pu1_dst_tile + dst_strd),
                                 _mm_srli_si128(rows_01_16x8b, 8));
                _mm_storel_epi64((__m128i *)(pu1_dst_tile + 2 * dst_strd), rows_23_16x8b);
                _mm_storel_epi64((__m128i *)(pu1_dst_tile + 3 * dst_strd),
                                 _mm_srli_si128(rows_23_16x8b, 8));
            }
        }
    }
}
186
/**
*******************************************************************************
*
* @brief
*  DC-only inverse transform and reconstruction of one component of an
*  interleaved chroma block
*
* @par Description:
*  Derives one DC residual from i2_coeff_value (multiply by 64, round, shift
*  by IT_SHIFT_STAGE_1 and then IT_SHIFT_STAGE_2, clipping to 16 bits after
*  each stage). Only the bytes at even offsets from pu1_pred are used as
*  prediction samples; the residual is added to them and the saturated 8-bit
*  results are written to the even offsets from pu1_dst. The bytes at odd
*  offsets from pu1_dst are read and written back unchanged so that the other
*  chroma component is not disturbed.
*
* @param[in] pu1_pred
*  Pointer to the first prediction sample of the component to reconstruct
*
* @param[in,out] pu1_dst
*  Pointer to the first destination sample of the component to reconstruct
*
* @param[in] pred_strd
*  Distance in bytes between two consecutive prediction rows
*
* @param[in] dst_strd
*  Distance in bytes between two consecutive destination rows
*
* @param[in] log2_trans_size
*  Log2 of the block width/height in samples of one component
*
* @param[in] i2_coeff_value
*  The single (DC) coefficient of the block
*
* @returns  None
*
* @remarks
*  - A 4x4 block reads and writes 8 bytes per row.
*  - Every other size is processed in tiles of 4 rows x 8 samples (16 bytes
*    per row).
*  - Within a tile every load is issued before the first store.
*  - NOTE(review): presumably the caller offsets both pointers by one byte to
*    select the second component - confirm against the caller.
*
*******************************************************************************
*/
void ihevcd_itrans_recon_dc_chroma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
                                         WORD32 log2_trans_size, WORD16 i2_coeff_value)
{
    const WORD32 stage1_shift = IT_SHIFT_STAGE_1;
    const WORD32 stage2_shift = IT_SHIFT_STAGE_2;
    const WORD32 blk_size = (1 << log2_trans_size);
    const __m128i zero_16x8b = _mm_setzero_si128();
    __m128i dc_8x16b;
    __m128i a_keep_16x8b[4];
    WORD32 dc_value;
    WORD32 r;

    /* Two-stage scaling of the DC coefficient */
    dc_value = i2_coeff_value;
    dc_value = CLIP_S16((dc_value * 64 + (1 << (stage1_shift - 1))) >> stage1_shift);
    dc_value = CLIP_S16((dc_value * 64 + (1 << (stage2_shift - 1))) >> stage2_shift);

    /* Residual replicated into all eight 16-bit lanes */
    dc_8x16b = _mm_set1_epi16(dc_value);

    if(4 == blk_size)
    {
        /* Gather bytes 0,2,4,6 (component being reconstructed) */
        const __m128i pick_mask_16x8b = _mm_setr_epi8(0, 2, 4, 6, 0, 0, 0, 0,
                                                      0, 0, 0, 0, 0, 0, 0, 0);
        /* Gather bytes 1,3,5,7 (component that must be left untouched) */
        const __m128i keep_mask_16x8b = _mm_setr_epi8(1, 3, 5, 7, 0, 0, 0, 0,
                                                      0, 0, 0, 0, 0, 0, 0, 0);
        __m128i a_pred_16x8b[4];
        __m128i rows_01_8x16b, rows_23_8x16b, recon_16x8b;

        /* Prediction: 8 interleaved bytes per row, keep the 4 wanted ones */
        for(r = 0; r < 4; r++)
        {
            a_pred_16x8b[r] = _mm_loadl_epi64((__m128i *)(pu1_pred + r * pred_strd));
            a_pred_16x8b[r] = _mm_shuffle_epi8(a_pred_16x8b[r], pick_mask_16x8b);
        }

        /* Two rows per register, widened to 16 bits, residual added */
        rows_01_8x16b = _mm_unpacklo_epi32(a_pred_16x8b[0], a_pred_16x8b[1]);
        rows_23_8x16b = _mm_unpacklo_epi32(a_pred_16x8b[2], a_pred_16x8b[3]);
        rows_01_8x16b = _mm_add_epi16(_mm_unpacklo_epi8(rows_01_8x16b, zero_16x8b), dc_8x16b);
        rows_23_8x16b = _mm_add_epi16(_mm_unpacklo_epi8(rows_23_8x16b, zero_16x8b), dc_8x16b);

        /* Saturated result: row0 | row1 | row2 | row3, 4 bytes each */
        recon_16x8b = _mm_packus_epi16(rows_01_8x16b, rows_23_8x16b);

        /* Current destination contents of the other component */
        for(r = 0; r < 4; r++)
        {
            a_keep_16x8b[r] = _mm_loadl_epi64((__m128i *)(pu1_dst + r * dst_strd));
            a_keep_16x8b[r] = _mm_shuffle_epi8(a_keep_16x8b[r], keep_mask_16x8b);
        }

        /* Re-interleave and store; shift the next row into the low 4 bytes */
        for(r = 0; r < 4; r++)
        {
            _mm_storel_epi64((__m128i *)(pu1_dst + r * dst_strd),
                             _mm_unpacklo_epi8(recon_16x8b, a_keep_16x8b[r]));
            recon_16x8b = _mm_srli_si128(recon_16x8b, 4);
        }
    }
    else
    {
        /* Gather the 8 even bytes; the upper 8 selectors are don't-care */
        const __m128i pick_mask_16x8b = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,
                                                      0, 0, 0, 0, 0, 0, 0, 0);
        /* Gather the 8 odd bytes; the upper 8 selectors are don't-care */
        const __m128i keep_mask_16x8b = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15,
                                                      0, 0, 0, 0, 0, 0, 0, 0);
        WORD32 row, col;

        for(row = 0; row < blk_size; row += 4)
        {
            for(col = 0; col < blk_size; col += 8)
            {
                /* Each sample occupies two bytes in the interleaved layout */
                UWORD8 *pu1_pred_tile = pu1_pred + row * pred_strd + 2 * col;
                UWORD8 *pu1_dst_tile = pu1_dst + row * dst_strd + 2 * col;
                __m128i a_res_8x16b[4];
                __m128i a_out_16x8b[4];
                __m128i rows_01_16x8b, rows_23_16x8b;

                /* Prediction: pick one component, widen, add the residual */
                for(r = 0; r < 4; r++)
                {
                    __m128i pred_16x8b;

                    pred_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_tile + r * pred_strd));
                    pred_16x8b = _mm_shuffle_epi8(pred_16x8b, pick_mask_16x8b);
                    a_res_8x16b[r] = _mm_add_epi16(_mm_unpacklo_epi8(pred_16x8b, zero_16x8b),
                                                   dc_8x16b);
                }

                /* Current destination contents of the other component */
                for(r = 0; r < 4; r++)
                {
                    a_keep_16x8b[r] = _mm_loadu_si128((__m128i *)(pu1_dst_tile + r * dst_strd));
                    a_keep_16x8b[r] = _mm_shuffle_epi8(a_keep_16x8b[r], keep_mask_16x8b);
                }

                /* Saturate to 8 bits; each register holds two rows */
                rows_01_16x8b = _mm_packus_epi16(a_res_8x16b[0], a_res_8x16b[1]);
                rows_23_16x8b = _mm_packus_epi16(a_res_8x16b[2], a_res_8x16b[3]);

                /* Re-interleave each row with the preserved component */
                a_out_16x8b[0] = _mm_unpacklo_epi8(rows_01_16x8b, a_keep_16x8b[0]);
                a_out_16x8b[1] = _mm_unpacklo_epi8(_mm_srli_si128(rows_01_16x8b, 8),
                                                   a_keep_16x8b[1]);
                a_out_16x8b[2] = _mm_unpacklo_epi8(rows_23_16x8b, a_keep_16x8b[2]);
                a_out_16x8b[3] = _mm_unpacklo_epi8(_mm_srli_si128(rows_23_16x8b, 8),
                                                   a_keep_16x8b[3]);

                /* Write the full 16-byte row of the tile */
                for(r = 0; r < 4; r++)
                {
                    _mm_storeu_si128((__m128i *)(pu1_dst_tile + r * dst_strd), a_out_16x8b[r]);
                }
            }
        }
    }
}
402