1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 *  ihevcd_it_rec_dc_x86_intr.c
22 *
23 * @brief
24 *  Platform specific intrinsic implementation of certain functions
25 *
26 * @author
27 *  Ittiam
28 * @par List of Functions:
*  - ihevcd_itrans_recon_dc_luma_sse42
*  - ihevcd_itrans_recon_dc_chroma_sse42
31 *
32 * @remarks
33 *  None
34 *
35 *******************************************************************************
36 */
37 
#include <string.h>

#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevcd_function_selector.h"

#include <immintrin.h>
45 
46 
/**
*******************************************************************************
*
* @brief
*  DC-only inverse transform and reconstruction of a square luma block
*
* @par Description:
*  A single residual value is derived from the DC coefficient by running it
*  through both inverse transform stages (multiply by 64, add the rounding
*  offset, shift right by IT_SHIFT_STAGE_1 and then by IT_SHIFT_STAGE_2, with
*  a clip to signed 16 bit after each stage). That value is added to every
*  prediction sample of the (1 << log2_trans_size) square block and the sum,
*  saturated to unsigned 8 bit, is written to the destination.
*
*  For every tile all prediction rows are read before any destination row is
*  written.
*
* @param[in] pu1_pred
*  Pointer to the top-left prediction sample
*
* @param[out] pu1_dst
*  Pointer to the top-left reconstructed sample
*
* @param[in] pred_strd
*  Distance in bytes between two prediction rows
*
* @param[in] dst_strd
*  Distance in bytes between two destination rows
*
* @param[in] log2_trans_size
*  Log2 of the block width/height. Sizes other than 4 are processed in tiles
*  of 8 columns by 4 rows, so they are assumed to be multiples of 8; this is
*  not checked here
*
* @param[in] i2_coeff_value
*  DC coefficient of the block
*
* @returns
*  None
*
*******************************************************************************
*/
void ihevcd_itrans_recon_dc_luma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
                                       WORD32 log2_trans_size, WORD16 i2_coeff_value)
{
    __m128i dc_8x16b, zero_8x16b;
    WORD32 add, shift;
    WORD32 dc_value, quant_out;
    WORD32 trans_size;

    trans_size = (1 << log2_trans_size);

    quant_out = i2_coeff_value;

    /* Two transform stages applied to a lone DC coefficient */
    shift = IT_SHIFT_STAGE_1;
    add = 1 << (shift - 1);
    dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
    shift = IT_SHIFT_STAGE_2;
    add = 1 << (shift - 1);
    dc_value = CLIP_S16((dc_value * 64 + add) >> shift);

    /* Replicate the DC value in all eight 16 bit lanes */
    dc_8x16b = _mm_set1_epi16(dc_value);
    zero_8x16b = _mm_setzero_si128();

    if(trans_size == 4)
    {
        WORD32 ai4_row[4];
        WORD32 row;
        __m128i pred_16x8b, res_01_8x16b, res_23_8x16b, recon_16x8b;

        /* Fetch exactly the four bytes each row owns. memcpy keeps the     */
        /* access free of alignment and aliasing assumptions and does not   */
        /* touch the bytes that follow the block                            */
        for(row = 0; row < 4; row++)
        {
            memcpy(&ai4_row[row], pu1_pred + row * pred_strd, sizeof(ai4_row[row]));
        }

        /* Rows 0..3 occupy bytes 0..15 of a single register */
        pred_16x8b = _mm_set_epi32(ai4_row[3], ai4_row[2], ai4_row[1], ai4_row[0]);

        /* Widen to 16 bit and add the residual */
        res_01_8x16b = _mm_add_epi16(_mm_unpacklo_epi8(pred_16x8b, zero_8x16b), dc_8x16b);
        res_23_8x16b = _mm_add_epi16(_mm_unpackhi_epi8(pred_16x8b, zero_8x16b), dc_8x16b);

        /* Saturate back to 8 bit; byte order is again row 0..3 */
        recon_16x8b = _mm_packus_epi16(res_01_8x16b, res_23_8x16b);

        ai4_row[0] = _mm_cvtsi128_si32(recon_16x8b);
        ai4_row[1] = _mm_cvtsi128_si32(_mm_srli_si128(recon_16x8b, 4));
        ai4_row[2] = _mm_cvtsi128_si32(_mm_srli_si128(recon_16x8b, 8));
        ai4_row[3] = _mm_cvtsi128_si32(_mm_srli_si128(recon_16x8b, 12));

        for(row = 0; row < 4; row++)
        {
            memcpy(pu1_dst + row * dst_strd, &ai4_row[row], sizeof(ai4_row[row]));
        }
    }
    else
    {
        WORD32 row, col;

        for(row = 0; row < trans_size; row += 4)
        {
            UWORD8 *pu1_pred_row = pu1_pred + row * pred_strd;
            UWORD8 *pu1_dst_row = pu1_dst + row * dst_strd;

            for(col = 0; col < trans_size; col += 8)
            {
                UWORD8 *pu1_src = pu1_pred_row + col;
                UWORD8 *pu1_out = pu1_dst_row + col;
                __m128i row0_8x16b, row1_8x16b, row2_8x16b, row3_8x16b;
                __m128i recon_01_16x8b, recon_23_16x8b;

                /* Load an 8x4 tile of prediction samples */
                row0_8x16b = _mm_loadl_epi64((__m128i *)pu1_src);
                row1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + pred_strd));
                row2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * pred_strd));
                row3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * pred_strd));

                /* Widen to 16 bit and add the residual */
                row0_8x16b = _mm_add_epi16(_mm_unpacklo_epi8(row0_8x16b, zero_8x16b), dc_8x16b);
                row1_8x16b = _mm_add_epi16(_mm_unpacklo_epi8(row1_8x16b, zero_8x16b), dc_8x16b);
                row2_8x16b = _mm_add_epi16(_mm_unpacklo_epi8(row2_8x16b, zero_8x16b), dc_8x16b);
                row3_8x16b = _mm_add_epi16(_mm_unpacklo_epi8(row3_8x16b, zero_8x16b), dc_8x16b);

                /* Saturate to 8 bit; each register carries two rows */
                recon_01_16x8b = _mm_packus_epi16(row0_8x16b, row1_8x16b);
                recon_23_16x8b = _mm_packus_epi16(row2_8x16b, row3_8x16b);

                _mm_storel_epi64((__m128i *)pu1_out, recon_01_16x8b);
                _mm_storel_epi64((__m128i *)(pu1_out + dst_strd),
                                 _mm_srli_si128(recon_01_16x8b, 8));
                _mm_storel_epi64((__m128i *)(pu1_out + 2 * dst_strd), recon_23_16x8b);
                _mm_storel_epi64((__m128i *)(pu1_out + 3 * dst_strd),
                                 _mm_srli_si128(recon_23_16x8b, 8));
            }
        }
    }
}
186 
/**
*******************************************************************************
*
* @brief
*  DC-only inverse transform and reconstruction of one chroma component held
*  in a byte-interleaved buffer
*
* @par Description:
*  A single residual value is derived from the DC coefficient by running it
*  through both inverse transform stages (multiply by 64, add the rounding
*  offset, shift right by IT_SHIFT_STAGE_1 and then by IT_SHIFT_STAGE_2, with
*  a clip to signed 16 bit after each stage). Only the bytes at even offsets
*  from pu1_pred take part: the residual is added to them and the sum,
*  saturated to unsigned 8 bit, is written to the even offsets of pu1_dst.
*  The bytes at odd offsets of pu1_dst are read and written back unchanged, so
*  the other component sharing the buffer keeps its value.
*
*  For every tile all prediction and destination rows are read before any
*  destination row is written.
*
* @param[in] pu1_pred
*  Pointer to the first prediction sample of the component being processed
*
* @param[in,out] pu1_dst
*  Pointer to the first destination sample of the component being processed
*
* @param[in] pred_strd
*  Distance in bytes between two prediction rows
*
* @param[in] dst_strd
*  Distance in bytes between two destination rows
*
* @param[in] log2_trans_size
*  Log2 of the block width/height in samples of one component. Sizes other
*  than 4 are processed in tiles of 8 samples (16 bytes) by 4 rows
*
* @param[in] i2_coeff_value
*  DC coefficient of the block
*
* @returns
*  None
*
*******************************************************************************
*/
void ihevcd_itrans_recon_dc_chroma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
                                         WORD32 log2_trans_size, WORD16 i2_coeff_value)
{
    const WORD32 trans_size = (1 << log2_trans_size);
    const __m128i zero_8x16b = _mm_setzero_si128();
    __m128i dc_8x16b;
    WORD32 rnd, shift;
    WORD32 dc_value;

    /* First transform stage */
    shift = IT_SHIFT_STAGE_1;
    rnd = 1 << (shift - 1);
    dc_value = i2_coeff_value;
    dc_value = CLIP_S16((dc_value * 64 + rnd) >> shift);

    /* Second transform stage */
    shift = IT_SHIFT_STAGE_2;
    rnd = 1 << (shift - 1);
    dc_value = CLIP_S16((dc_value * 64 + rnd) >> shift);

    /* Same residual in all eight 16 bit lanes */
    dc_8x16b = _mm_set1_epi16(dc_value);

    if(4 == trans_size)
    {
        /* Selectors gathering bytes 0,2,4,6 (processed component) and     */
        /* bytes 1,3,5,7 (component to be preserved) into the low 4 bytes  */
        const __m128i sel_even_16x8b = _mm_cvtsi32_si128(0x06040200);
        const __m128i sel_odd_16x8b = _mm_cvtsi32_si128(0x07050301);
        __m128i pred_r0, pred_r1, pred_r2, pred_r3;
        __m128i keep_r0, keep_r1, keep_r2, keep_r3;
        __m128i res_01_8x16b, res_23_8x16b, recon_16x8b;
        __m128i out_r0, out_r1, out_r2, out_r3;

        /* Prediction: 8 interleaved bytes per row, keep the even ones */
        pred_r0 = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i *)pu1_pred), sel_even_16x8b);
        pred_r1 = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd)), sel_even_16x8b);
        pred_r2 = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd)), sel_even_16x8b);
        pred_r3 = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd)), sel_even_16x8b);

        /* Two rows per register, widened to 16 bit, plus the residual */
        res_01_8x16b = _mm_unpacklo_epi8(_mm_unpacklo_epi32(pred_r0, pred_r1), zero_8x16b);
        res_23_8x16b = _mm_unpacklo_epi8(_mm_unpacklo_epi32(pred_r2, pred_r3), zero_8x16b);
        res_01_8x16b = _mm_add_epi16(res_01_8x16b, dc_8x16b);
        res_23_8x16b = _mm_add_epi16(res_23_8x16b, dc_8x16b);

        /* Destination: fetch the odd bytes so they can be written back */
        keep_r0 = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i *)pu1_dst), sel_odd_16x8b);
        keep_r1 = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd)), sel_odd_16x8b);
        keep_r2 = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i *)(pu1_dst + 2 * dst_strd)), sel_odd_16x8b);
        keep_r3 = _mm_shuffle_epi8(_mm_loadl_epi64((__m128i *)(pu1_dst + 3 * dst_strd)), sel_odd_16x8b);

        /* Saturate to 8 bit: bytes 0..15 hold rows 0..3, four bytes each */
        recon_16x8b = _mm_packus_epi16(res_01_8x16b, res_23_8x16b);

        /* Re-interleave each row with the preserved component */
        out_r0 = _mm_unpacklo_epi8(recon_16x8b, keep_r0);
        out_r1 = _mm_unpacklo_epi8(_mm_srli_si128(recon_16x8b, 4), keep_r1);
        out_r2 = _mm_unpacklo_epi8(_mm_srli_si128(recon_16x8b, 8), keep_r2);
        out_r3 = _mm_unpacklo_epi8(_mm_srli_si128(recon_16x8b, 12), keep_r3);

        /* Only the low 8 bytes of each row register are meaningful */
        _mm_storel_epi64((__m128i *)pu1_dst, out_r0);
        _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), out_r1);
        _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), out_r2);
        _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), out_r3);
    }
    else
    {
        /* Selectors gathering the 8 even / 8 odd bytes of a 16 byte row   */
        /* into the low half; the upper 8 selector bytes are zero          */
        const __m128i sel_even_16x8b = _mm_set_epi32(0, 0, 0x0E0C0A08, 0x06040200);
        const __m128i sel_odd_16x8b = _mm_set_epi32(0, 0, 0x0F0D0B09, 0x07050301);
        WORD32 row, col;

        for(row = 0; row < trans_size; row += 4)
        {
            UWORD8 *pu1_pred_row = pu1_pred + row * pred_strd;
            UWORD8 *pu1_dst_row = pu1_dst + row * dst_strd;

            for(col = 0; col < trans_size; col += 8)
            {
                /* 8 samples of one component span 16 interleaved bytes */
                UWORD8 *pu1_src = pu1_pred_row + 2 * col;
                UWORD8 *pu1_out = pu1_dst_row + 2 * col;
                __m128i res_r0, res_r1, res_r2, res_r3;
                __m128i keep_r0, keep_r1, keep_r2, keep_r3;
                __m128i recon_01_16x8b, recon_23_16x8b;
                __m128i out_r0, out_r1, out_r2, out_r3;

                /* Prediction: keep only the component being processed */
                res_r0 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)pu1_src), sel_even_16x8b);
                res_r1 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(pu1_src + pred_strd)), sel_even_16x8b);
                res_r2 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(pu1_src + 2 * pred_strd)), sel_even_16x8b);
                res_r3 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(pu1_src + 3 * pred_strd)), sel_even_16x8b);

                /* Widen to 16 bit and add the residual */
                res_r0 = _mm_add_epi16(_mm_unpacklo_epi8(res_r0, zero_8x16b), dc_8x16b);
                res_r1 = _mm_add_epi16(_mm_unpacklo_epi8(res_r1, zero_8x16b), dc_8x16b);
                res_r2 = _mm_add_epi16(_mm_unpacklo_epi8(res_r2, zero_8x16b), dc_8x16b);
                res_r3 = _mm_add_epi16(_mm_unpacklo_epi8(res_r3, zero_8x16b), dc_8x16b);

                /* Destination: fetch the odd bytes so they can be written back */
                keep_r0 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)pu1_out), sel_odd_16x8b);
                keep_r1 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(pu1_out + dst_strd)), sel_odd_16x8b);
                keep_r2 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(pu1_out + 2 * dst_strd)), sel_odd_16x8b);
                keep_r3 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(pu1_out + 3 * dst_strd)), sel_odd_16x8b);

                /* Saturate to 8 bit; each register carries two rows */
                recon_01_16x8b = _mm_packus_epi16(res_r0, res_r1);
                recon_23_16x8b = _mm_packus_epi16(res_r2, res_r3);

                /* Re-interleave each row with the preserved component */
                out_r0 = _mm_unpacklo_epi8(recon_01_16x8b, keep_r0);
                out_r1 = _mm_unpacklo_epi8(_mm_srli_si128(recon_01_16x8b, 8), keep_r1);
                out_r2 = _mm_unpacklo_epi8(recon_23_16x8b, keep_r2);
                out_r3 = _mm_unpacklo_epi8(_mm_srli_si128(recon_23_16x8b, 8), keep_r3);

                /* Write the full 16 interleaved bytes of every row */
                _mm_storeu_si128((__m128i *)pu1_out, out_r0);
                _mm_storeu_si128((__m128i *)(pu1_out + dst_strd), out_r1);
                _mm_storeu_si128((__m128i *)(pu1_out + 2 * dst_strd), out_r2);
                _mm_storeu_si128((__m128i *)(pu1_out + 3 * dst_strd), out_r3);
            }
        }
    }
}
402