1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 *  ihevcd_frm_cvt_x86_intr.c
22 *
23 * @brief
24 *  Platform specific intrinsic implementation of certain functions
25 *
26 * @author
27 *  Ittiam
28 * @par List of Functions:
29 *  - ihevcd_itrans_recon_dc
*  - ihevcd_fmt_conv_420sp_to_420p_ssse3
31 *
32 * @remarks
33 *  None
34 *
35 *******************************************************************************
36 */
37 #include "string.h"
38 #include "ihevc_typedefs.h"
39 #include "ihevc_defs.h"
40 #include "ihevc_macros.h"
41 #include "ihevc_platform_macros.h"
42 #include "ihevcd_function_selector.h"
43 #include <string.h>
44 #include <immintrin.h>
45 
46 
/**
*******************************************************************************
*
* @brief
*  De-interleaves one 32-byte run of interleaved samples (SSSE3)
*
* @par Description:
*  Loads 32 bytes from pu1_src, stores the 16 bytes found at even offsets to
*  pu1_even_dst and the 16 bytes found at odd offsets to pu1_odd_dst. All
*  loads and stores are unaligned. Both loads are issued before either store.
*
* @param[in] pu1_src
*  Source of 32 interleaved bytes
*
* @param[out] pu1_even_dst
*  Receives source bytes 0, 2, 4, ... 30
*
* @param[out] pu1_odd_dst
*  Receives source bytes 1, 3, 5, ... 31
*
* @param[in] alt_first_mask
*  Byte-shuffle control that moves the even-offset bytes of a register into
*  its low 64 bits and the odd-offset bytes into its high 64 bits
*
* @returns
*  None
*
*******************************************************************************
*/
static void ihevcd_deinterleave_16_ssse3(const UWORD8 *pu1_src,
                                         UWORD8 *pu1_even_dst,
                                         UWORD8 *pu1_odd_dst,
                                         __m128i alt_first_mask)
{
    __m128i src0_8x16b = _mm_loadu_si128((const __m128i *)pu1_src);
    __m128i src1_8x16b = _mm_loadu_si128((const __m128i *)(pu1_src + 16));

    /* Each register now holds: low 64 bits = even bytes, high 64 = odd bytes */
    __m128i temp0_8x16b = _mm_shuffle_epi8(src0_8x16b, alt_first_mask);
    __m128i temp1_8x16b = _mm_shuffle_epi8(src1_8x16b, alt_first_mask);

    /* Join the two low halves (even bytes) and the two high halves (odd) */
    _mm_storeu_si128((__m128i *)pu1_even_dst,
                     _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b));
    _mm_storeu_si128((__m128i *)pu1_odd_dst,
                     _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b));
}

/**
*******************************************************************************
*
* @brief
*  Converts a 4:2:0 semi-planar picture to 4:2:0 planar (SSSE3)
*
* @par Description:
*  Optionally copies the luma plane row by row (wd bytes for each of ht rows)
*  and then splits the interleaved chroma plane into two separate planes of
*  (wd >> 1) samples by (ht >> 1) rows. Full groups of 16 chroma samples are
*  handled with SSSE3 byte shuffles; any remaining columns of each row are
*  handled with scalar code. Both paths use the same plane assignment:
*  source bytes at even offsets go to pu1_u_dst when is_u_first is non-zero
*  and to pu1_v_dst otherwise; bytes at odd offsets go to the other plane.
*
* @param[in] pu1_y_src
*  Source luma plane (read only when disable_luma_copy is 0)
*
* @param[in] pu1_uv_src
*  Source interleaved chroma plane
*
* @param[out] pu1_y_dst
*  Destination luma plane (written only when disable_luma_copy is 0)
*
* @param[out] pu1_u_dst
*  Destination U plane
*
* @param[out] pu1_v_dst
*  Destination V plane
*
* @param[in] wd
*  Luma width in samples; chroma width used is wd >> 1
*
* @param[in] ht
*  Luma height in rows; chroma height used is ht >> 1
*
* @param[in] src_y_strd
*  Byte distance between consecutive source luma rows
*
* @param[in] src_uv_strd
*  Byte distance between consecutive source chroma rows
*
* @param[in] dst_y_strd
*  Byte distance between consecutive destination luma rows
*
* @param[in] dst_uv_strd
*  Byte distance between consecutive rows of each destination chroma plane
*
* @param[in] is_u_first
*  Non-zero: even-offset chroma bytes are U. Zero: even-offset bytes are V
*
* @param[in] disable_luma_copy
*  Zero: copy the luma plane. Non-zero: leave the luma destination untouched
*
* @returns
*  None
*
* @remarks
*  The vector path reads 32 source bytes and writes 16 bytes to each chroma
*  plane per step and never touches columns beyond the last full group of 16.
*
*******************************************************************************
*/
void ihevcd_fmt_conv_420sp_to_420p_ssse3(UWORD8 *pu1_y_src,
                                         UWORD8 *pu1_uv_src,
                                         UWORD8 *pu1_y_dst,
                                         UWORD8 *pu1_u_dst,
                                         UWORD8 *pu1_v_dst,
                                         WORD32 wd,
                                         WORD32 ht,
                                         WORD32 src_y_strd,
                                         WORD32 src_uv_strd,
                                         WORD32 dst_y_strd,
                                         WORD32 dst_uv_strd,
                                         WORD32 is_u_first,
                                         WORD32 disable_luma_copy)
{
    /* Shuffle control: even source byte indices first, then odd ones */
    static const UWORD8 FIRST_ALT_SHUFFLE[16] = {
        0x00, 0x02, 0x04, 0x06,
        0x08, 0x0A, 0x0C, 0x0E,
        0x01, 0x03, 0x05, 0x07,
        0x09, 0x0B, 0x0D, 0x0F };

    UWORD8 *pu1_even_dst, *pu1_odd_dst;
    WORD32 num_rows, num_cols, cols;
    WORD32 i, j;
    __m128i alt_first_mask;

    if(0 == disable_luma_copy)
    {
        /* copy luma */
        UWORD8 *pu1_src = pu1_y_src;
        UWORD8 *pu1_dst = pu1_y_dst;

        for(i = 0; i < ht; i++)
        {
            memcpy(pu1_dst, pu1_src, wd);
            pu1_dst += dst_y_strd;
            pu1_src += src_y_strd;
        }
    }

    /* Decide once which plane receives the even-offset / odd-offset bytes. */
    /* The vector and the scalar code below both use this single mapping,   */
    /* so every column of a row ends up in the same plane.                  */
    if(is_u_first)
    {
        pu1_even_dst = pu1_u_dst;
        pu1_odd_dst = pu1_v_dst;
    }
    else
    {
        pu1_even_dst = pu1_v_dst;
        pu1_odd_dst = pu1_u_dst;
    }

    /* Warm up the first eight chroma rows. PREFETCH is kept in its original */
    /* semicolon-less form; the braces keep the loop well-formed even if the */
    /* macro expands to nothing.                                             */
    for(i = 0; i < 8; i++)
    {
        PREFETCH((char const *)(pu1_uv_src + (i * src_uv_strd)), _MM_HINT_T0)
    }

    num_rows = ht >> 1;
    num_cols = wd >> 1;
    cols = 0;

    alt_first_mask = _mm_loadu_si128((const __m128i *)&FIRST_ALT_SHUFFLE[0]);

    if(num_cols > 15)
    {
        UWORD8 *pu1_src_row = pu1_uv_src;
        UWORD8 *pu1_even_row = pu1_even_dst;
        UWORD8 *pu1_odd_row = pu1_odd_dst;

        /* number of full 16-sample groups in every chroma row */
        cols = num_cols >> 4;

        /* four rows at a time */
        for(i = 0; i < (num_rows >> 2); i++)
        {
            UWORD8 *pu1_src_temp = pu1_src_row;
            UWORD8 *pu1_even_temp = pu1_even_row;
            UWORD8 *pu1_odd_temp = pu1_odd_row;

            /* fetch the four rows that lie eight rows ahead */
            for(j = 8; j < 12; j++)
            {
                PREFETCH((char const *)(pu1_src_row + (j * src_uv_strd)), _MM_HINT_T0)
            }

            for(j = 0; j < cols; j++)
            {
                ihevcd_deinterleave_16_ssse3(pu1_src_temp,
                                             pu1_even_temp,
                                             pu1_odd_temp,
                                             alt_first_mask);
                ihevcd_deinterleave_16_ssse3(pu1_src_temp + (1 * src_uv_strd),
                                             pu1_even_temp + (1 * dst_uv_strd),
                                             pu1_odd_temp + (1 * dst_uv_strd),
                                             alt_first_mask);
                ihevcd_deinterleave_16_ssse3(pu1_src_temp + (2 * src_uv_strd),
                                             pu1_even_temp + (2 * dst_uv_strd),
                                             pu1_odd_temp + (2 * dst_uv_strd),
                                             alt_first_mask);
                ihevcd_deinterleave_16_ssse3(pu1_src_temp + (3 * src_uv_strd),
                                             pu1_even_temp + (3 * dst_uv_strd),
                                             pu1_odd_temp + (3 * dst_uv_strd),
                                             alt_first_mask);

                pu1_even_temp += 16;
                pu1_odd_temp += 16;
                pu1_src_temp += 32;
            }

            pu1_even_row += 4 * dst_uv_strd;
            pu1_odd_row += 4 * dst_uv_strd;
            pu1_src_row += 4 * src_uv_strd;
        }

        /* rows left over after the groups of four */
        for(i = 0; i < (num_rows & 0x3); i++)
        {
            UWORD8 *pu1_src_temp = pu1_src_row;
            UWORD8 *pu1_even_temp = pu1_even_row;
            UWORD8 *pu1_odd_temp = pu1_odd_row;

            for(j = 0; j < cols; j++)
            {
                ihevcd_deinterleave_16_ssse3(pu1_src_temp,
                                             pu1_even_temp,
                                             pu1_odd_temp,
                                             alt_first_mask);

                pu1_even_temp += 16;
                pu1_odd_temp += 16;
                pu1_src_temp += 32;
            }

            pu1_even_row += dst_uv_strd;
            pu1_odd_row += dst_uv_strd;
            pu1_src_row += src_uv_strd;
        }

        /* only the columns past the last full group remain */
        num_cols &= 0x0F;
    }

    if(num_cols)
    {
        /* scalar tail: start right after the columns done by the vector code */
        UWORD8 *pu1_src_row = pu1_uv_src + 2 * (cols << 4);
        UWORD8 *pu1_even_row = pu1_even_dst + (cols << 4);
        UWORD8 *pu1_odd_row = pu1_odd_dst + (cols << 4);

        for(i = 0; i < num_rows; i++)
        {
            for(j = 0; j < num_cols; j++)
            {
                pu1_even_row[j] = pu1_src_row[2 * j];
                pu1_odd_row[j] = pu1_src_row[2 * j + 1];
            }

            pu1_even_row += dst_uv_strd;
            pu1_odd_row += dst_uv_strd;
            pu1_src_row += src_uv_strd;
        }
    }
}
271