1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /*****************************************************************************/
21 /*                                                                           */
22 /*  File Name         : ih264_deblk_chroma_ssse3.c                           */
23 /*                                                                           */
24 /*  Description       : Contains function definitions for deblocking         */
25 /*                                                                           */
26 /*  List of Functions : ih264_deblk_chroma_vert_bs4_ssse3()                  */
27 /*                      ih264_deblk_chroma_horz_bs4_ssse3()                  */
28 /*                      ih264_deblk_chroma_vert_bslt4_ssse3()                */
29 /*                      ih264_deblk_chroma_horz_bslt4_ssse3()                */
30 /*                      ih264_deblk_chroma_vert_bs4_mbaff_ssse3()            */
31 /*                      ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()          */
32 /*                                                                           */
33 /*  Issues / Problems : None                                                 */
34 /*                                                                           */
35 /*  Revision History  :                                                      */
36 /*                                                                           */
37 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Added chroma deblocking ssse3        */
39 /*                                      intrinsics                           */
40 /*                                                                           */
41 /*****************************************************************************/
42 
43 /*****************************************************************************/
44 /* File Includes                                                             */
45 /*****************************************************************************/
46 
47 /* System include files */
48 #include <stdio.h>
49 
50 /* User include files */
51 #include "ih264_typedefs.h"
52 #include "ih264_platform_macros.h"
53 #include "ih264_deblk_edge_filters.h"
54 #include "ih264_macros.h"
55 
56 /*****************************************************************************/
57 /* Function Definitions                                                      */
58 /*****************************************************************************/
59 
60 /*****************************************************************************/
61 /*                                                                           */
62 /*  Function Name : ih264_deblk_chroma_vert_bs4_ssse3()                      */
63 /*                                                                           */
64 /*  Description   : This function performs filtering of a chroma block       */
65 /*                  vertical edge when the boundary strength is set to 4 in  */
66 /*                  high profile.                                            */
67 /*                                                                           */
68 /*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
69 /*                  src_strd   - source stride                               */
70 /*                  alpha_cb   - alpha value for the boundary in U           */
71 /*                  beta_cb    - beta value for the boundary in U            */
72 /*                  alpha_cr   - alpha value for the boundary in V           */
73 /*                  beta_cr    - beta value for the boundary in V            */
74 /*                                                                           */
75 /*  Globals       : None                                                     */
76 /*                                                                           */
77 /*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
78 /*                  title "Filtering process for edges for bS equal to 4" in */
79 /*                  ITU T Rec H.264 with alpha and beta values different in  */
80 /*                  U and V.                                                 */
81 /*                                                                           */
82 /*  Outputs       : None                                                     */
83 /*                                                                           */
84 /*  Returns       : None                                                     */
85 /*                                                                           */
86 /*  Issues        : None                                                     */
87 /*                                                                           */
88 /*  Revision History:                                                        */
89 /*                                                                           */
90 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
91 /*         12 02 2015   Naveen Kumar P  Initial version                      */
92 /*                                                                           */
93 /*****************************************************************************/
ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 * pu1_src,WORD32 src_strd,WORD32 alpha_cb,WORD32 beta_cb,WORD32 alpha_cr,WORD32 beta_cr)94 void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,
95                                        WORD32 src_strd,
96                                        WORD32 alpha_cb,
97                                        WORD32 beta_cb,
98                                        WORD32 alpha_cr,
99                                        WORD32 beta_cr)
100 {
101     UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
102     WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
103     WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
104     __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
105     __m128i temp1, temp2, temp3, temp4;
106 
107     __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
108     __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
109     __m128i flag1, flag2;
110     __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
111     __m128i zero = _mm_setzero_si128();
112     __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
113 
114     /* Load and transpose the pixel values */
115     linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
116     lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
117     linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
118     lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
119     linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
120     linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
121     lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
122     lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
123 
124     temp1 = _mm_unpacklo_epi16(linea, lineb);
125     temp2 = _mm_unpacklo_epi16(linec, lined);
126     temp3 = _mm_unpacklo_epi16(linee, linef);
127     temp4 = _mm_unpacklo_epi16(lineg, lineh);
128 
129     p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
130     p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
131     q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
132     q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
133 
134     p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
135     p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
136     q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
137     q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
138     /* End of transpose */
139 
140     q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
141     q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
142     p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
143     p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
144 
145     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
146     diff = _mm_abs_epi16(diff);
147     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
148     flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
149 
150     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
151     diff = _mm_abs_epi16(diff);
152     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
153     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
154 
155     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
156     diff = _mm_abs_epi16(diff);
157     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
158 
159     temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
160     temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
161     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
162     temp1 = _mm_add_epi16(temp1, temp2);
163     p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
164 
165     temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
166     temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
167     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
168     temp1 = _mm_add_epi16(temp1, temp2);
169     q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
170 
171     q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
172     q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
173     p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
174     p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
175 
176     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
177     diff = _mm_abs_epi16(diff);
178     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
179     flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
180 
181     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
182     diff = _mm_abs_epi16(diff);
183     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
184     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
185 
186     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
187     diff = _mm_abs_epi16(diff);
188     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
189 
190     temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
191     temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
192     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
193     temp1 = _mm_add_epi16(temp1, temp2);
194     p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
195 
196     temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
197     temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
198     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
199     temp1 = _mm_add_epi16(temp1, temp2);
200     q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
201 
202     p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
203     q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
204 
205     flag1 = _mm_packs_epi16(flag1, flag2);
206 
207     p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
208                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
209     p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
210     p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
211 
212     q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
213                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
214     q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
215     q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
216 
217     /* Inverse-transpose and store back */
218     temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
219     temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
220     temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
221     temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
222 
223     linea = _mm_unpacklo_epi32(temp1, temp3);
224     lineb = _mm_srli_si128(linea, 8);
225     linec = _mm_unpackhi_epi32(temp1, temp3);
226     lined = _mm_srli_si128(linec, 8);
227     linee = _mm_unpacklo_epi32(temp2, temp4);
228     linef = _mm_srli_si128(linee, 8);
229     lineg = _mm_unpackhi_epi32(temp2, temp4);
230     lineh = _mm_srli_si128(lineg, 8);
231 
232     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
233     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
234     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
235     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
236     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
237     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
238     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
239     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
240 
241 }
242 
243 /*****************************************************************************/
244 /*                                                                           */
245 /*  Function Name : ih264_deblk_chroma_horz_bs4_ssse3()                      */
246 /*                                                                           */
247 /*  Description   : This function performs filtering of a chroma block       */
248 /*                  horizontal edge when the boundary strength is set to 4   */
249 /*                  in high profile.                                         */
250 /*                                                                           */
251 /*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
252 /*                  src_strd   - source stride                               */
253 /*                  alpha_cb   - alpha value for the boundary in U           */
254 /*                  beta_cb    - beta value for the boundary in U            */
255 /*                  alpha_cr   - alpha value for the boundary in V           */
256 /*                  beta_cr    - beta value for the boundary in V            */
257 /*                                                                           */
258 /*  Globals       : None                                                     */
259 /*                                                                           */
260 /*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
261 /*                  title "Filtering process for edges for bS equal to 4" in */
262 /*                  ITU T Rec H.264 with alpha and beta values different in  */
263 /*                  U and V.                                                 */
264 /*                                                                           */
265 /*  Outputs       : None                                                     */
266 /*                                                                           */
267 /*  Returns       : None                                                     */
268 /*                                                                           */
269 /*  Issues        : None                                                     */
270 /*                                                                           */
271 /*  Revision History:                                                        */
272 /*                                                                           */
273 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
274 /*         12 02 2015   Naveen Kumar P  Initial version                      */
275 /*                                                                           */
276 /*****************************************************************************/
ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 * pu1_src,WORD32 src_strd,WORD32 alpha_cb,WORD32 beta_cb,WORD32 alpha_cr,WORD32 beta_cr)277 void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,
278                                        WORD32 src_strd,
279                                        WORD32 alpha_cb,
280                                        WORD32 beta_cb,
281                                        WORD32 alpha_cr,
282                                        WORD32 beta_cr)
283 {
284     UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
285     WORD16 i16_posP1, i16_posP0, i16_posQ1;
286 
287     UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
288     WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
289     WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
290     __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
291     __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
292     __m128i flag1, flag2;
293     __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
294     __m128i zero = _mm_setzero_si128();
295     __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
296     __m128i temp1, temp2;
297 
298     pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
299 
300     i16_posQ1 = src_strd;
301     i16_posP0 = src_strd;
302     i16_posP1 = 0;
303 
304     q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
305     q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
306     p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
307     p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
308 
309     q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
310     q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
311     p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
312     p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
313 
314     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
315     diff = _mm_abs_epi16(diff);
316     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
317     flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
318 
319     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
320     diff = _mm_abs_epi16(diff);
321     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
322     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
323 
324     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
325     diff = _mm_abs_epi16(diff);
326     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
327 
328     temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
329     temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
330     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
331     temp1 = _mm_add_epi16(temp1, temp2);
332     p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
333 
334     temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
335     temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
336     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
337     temp1 = _mm_add_epi16(temp1, temp2);
338     q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
339 
340     q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
341     q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
342     p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
343     p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
344 
345     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
346     diff = _mm_abs_epi16(diff);
347     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
348     flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
349 
350     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
351     diff = _mm_abs_epi16(diff);
352     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
353     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
354 
355     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
356     diff = _mm_abs_epi16(diff);
357     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
358 
359     temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
360     temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
361     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
362     temp1 = _mm_add_epi16(temp1, temp2);
363     p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
364 
365     temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
366     temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
367     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
368     temp1 = _mm_add_epi16(temp1, temp2);
369     q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
370 
371     p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
372     q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
373 
374     flag1 = _mm_packs_epi16(flag1, flag2);
375 
376     p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
377                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
378     p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
379     p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
380     _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
381 
382     q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
383                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
384     q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
385     q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
386     _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
387 
388 }
389 
390 /*****************************************************************************/
391 /*                                                                           */
392 /*  Function Name : ih264_deblk_chroma_vert_bslt4_ssse3()                    */
393 /*                                                                           */
394 /*  Description   : This function performs filtering of a chroma block       */
395 /*                  vertical edge when the boundary strength is less than 4  */
396 /*                  in high profile.                                         */
397 /*                                                                           */
398 /*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
399 /*                  src_strd         - source stride                         */
400 /*                  alpha_cb         - alpha value for the boundary in U     */
401 /*                  beta_cb          - beta value for the boundary in U      */
402 /*                  alpha_cr         - alpha value for the boundary in V     */
403 /*                  beta_cr          - beta value for the boundary in V      */
404 /*                  u4_bs            - packed Boundary strength array        */
405 /*                  pu1_cliptab_cb   - tc0_table for U                       */
406 /*                  pu1_cliptab_cr   - tc0_table for V                       */
407 /*                                                                           */
408 /*  Globals       : None                                                     */
409 /*                                                                           */
410 /*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
411 /*                  title "Filtering process for edges for bS less than 4"   */
412 /*                  in ITU T Rec H.264 with alpha and beta values different  */
413 /*                  in U and V.                                              */
414 /*                                                                           */
415 /*  Outputs       : None                                                     */
416 /*                                                                           */
417 /*  Returns       : None                                                     */
418 /*                                                                           */
419 /*  Issues        : None                                                     */
420 /*                                                                           */
421 /*  Revision History:                                                        */
422 /*                                                                           */
423 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
424 /*         12 02 2015   Naveen Kumar P  Initial version                      */
425 /*                                                                           */
426 /*****************************************************************************/
ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 * pu1_src,WORD32 src_strd,WORD32 alpha_cb,WORD32 beta_cb,WORD32 alpha_cr,WORD32 beta_cr,UWORD32 u4_bs,const UWORD8 * pu1_cliptab_cb,const UWORD8 * pu1_cliptab_cr)427 void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,
428                                          WORD32 src_strd,
429                                          WORD32 alpha_cb,
430                                          WORD32 beta_cb,
431                                          WORD32 alpha_cr,
432                                          WORD32 beta_cr,
433                                          UWORD32 u4_bs,
434                                          const UWORD8 *pu1_cliptab_cb,
435                                          const UWORD8 *pu1_cliptab_cr)
436 {
437     UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
438     UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
439     WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
440     WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
441     __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
442     __m128i temp1, temp2, temp3, temp4;
443 
444     __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
445     __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
446     __m128i flag_bs, flag1, flag2;
447     __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
448     __m128i zero = _mm_setzero_si128();
449     __m128i C0_uv_8x16;
450     __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
451 
452     u1_Bs0 = (u4_bs >> 24) & 0xff;
453     u1_Bs1 = (u4_bs >> 16) & 0xff;
454     u1_Bs2 = (u4_bs >> 8) & 0xff;
455     u1_Bs3 = (u4_bs >> 0) & 0xff;
456 
457     flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
458                            u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
459                            u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
460     flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
461     flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
462 
463     /* Load and transpose the pixel values */
464     linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
465     lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
466     linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
467     lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
468     linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
469     linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
470     lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
471     lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
472 
473     temp1 = _mm_unpacklo_epi16(linea, lineb);
474     temp2 = _mm_unpacklo_epi16(linec, lined);
475     temp3 = _mm_unpacklo_epi16(linee, linef);
476     temp4 = _mm_unpacklo_epi16(lineg, lineh);
477 
478     p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
479     p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
480     q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
481     q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
482 
483     p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
484     p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
485     q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
486     q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
487     /* End of transpose */
488 
489     q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
490     q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
491     p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
492     p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
493 
494     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
495     diff = _mm_abs_epi16(diff);
496     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
497     flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
498 
499     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
500     diff = _mm_abs_epi16(diff);
501     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
502     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
503 
504     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
505     diff = _mm_abs_epi16(diff);
506     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
507 
508     diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
509     diff = _mm_slli_epi16(diff, 2);
510     diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
511     diff = _mm_add_epi16(diff, diff1);
512     diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
513     in_macro = _mm_srai_epi16(diff, 3);
514 
515     C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
516                                pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
517                                pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
518                                pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
519 
520     C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
521 
522     in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
523     C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
524     in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
525 
526     p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
527     q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
528 
529     q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
530     q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
531     p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
532     p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
533 
534     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
535     diff = _mm_abs_epi16(diff);
536     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
537     flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
538 
539     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
540     diff = _mm_abs_epi16(diff);
541     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
542     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
543 
544     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
545     diff = _mm_abs_epi16(diff);
546     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
547 
548     diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
549     diff = _mm_slli_epi16(diff, 2);
550     diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
551     diff = _mm_add_epi16(diff, diff1);
552     diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
553     in_macro = _mm_srai_epi16(diff, 3);
554 
555     C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
556                                pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
557                                pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
558                                pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
559 
560     C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
561 
562     in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
563     C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
564     in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
565 
566     p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
567     q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
568 
569     p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
570     q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
571 
572     flag1 = _mm_packs_epi16(flag1, flag2);
573     flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
574 
575     p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
576                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
577     p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
578     p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
579 
580     q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
581                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
582     q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
583     q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
584 
585     /* Inverse-transpose and store back */
586     temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
587     temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
588     temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
589     temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
590 
591     linea = _mm_unpacklo_epi32(temp1, temp3);
592     lineb = _mm_srli_si128(linea, 8);
593     linec = _mm_unpackhi_epi32(temp1, temp3);
594     lined = _mm_srli_si128(linec, 8);
595     linee = _mm_unpacklo_epi32(temp2, temp4);
596     linef = _mm_srli_si128(linee, 8);
597     lineg = _mm_unpackhi_epi32(temp2, temp4);
598     lineh = _mm_srli_si128(lineg, 8);
599 
600     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
601     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
602     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
603     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
604     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
605     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
606     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
607     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
608 
609 }
610 
611 /*****************************************************************************/
612 /*                                                                           */
613 /*  Function Name : ih264_deblk_chroma_horz_bslt4_ssse3()                    */
614 /*                                                                           */
615 /*  Description   : This function performs filtering of a chroma block       */
616 /*                  horizontal edge when the boundary strength is less than  */
617 /*                  4 in high profile.                                       */
618 /*                                                                           */
619 /*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
620 /*                  src_strd         - source stride                         */
621 /*                  alpha_cb         - alpha value for the boundary in U     */
622 /*                  beta_cb          - beta value for the boundary in U      */
623 /*                  alpha_cr         - alpha value for the boundary in V     */
624 /*                  beta_cr          - beta value for the boundary in V      */
625 /*                  u4_bs            - packed Boundary strength array        */
626 /*                  pu1_cliptab_cb   - tc0_table for U                       */
627 /*                  pu1_cliptab_cr   - tc0_table for V                       */
628 /*                                                                           */
629 /*  Globals       : None                                                     */
630 /*                                                                           */
631 /*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
632 /*                  title "Filtering process for edges for bS less than 4"   */
633 /*                  in ITU T Rec H.264 with alpha and beta values different  */
634 /*                  in U and V.                                              */
635 /*                                                                           */
636 /*  Outputs       : None                                                     */
637 /*                                                                           */
638 /*  Returns       : None                                                     */
639 /*                                                                           */
640 /*  Issues        : None                                                     */
641 /*                                                                           */
642 /*  Revision History:                                                        */
643 /*                                                                           */
644 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
645 /*         12 02 2015   Naveen Kumar P  Initial version                      */
646 /*                                                                           */
647 /*****************************************************************************/
void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,
                                         WORD32 src_strd,
                                         WORD32 alpha_cb,
                                         WORD32 beta_cb,
                                         WORD32 alpha_cr,
                                         WORD32 beta_cr,
                                         UWORD32 u4_bs,
                                         const UWORD8 *pu1_cliptab_cb,
                                         const UWORD8 *pu1_cliptab_cr)
{
    /*
     * Filters one horizontal chroma edge, 16 bytes wide.
     *
     * pu1_src addresses the q0 row; q1 is one stride below it, p0 and p1 are
     * one and two strides above it. Only the p0 and q0 rows are written.
     * Even byte positions are compared against the cb parameters and odd
     * byte positions against the cr parameters.
     *
     * u4_bs packs four strengths, most significant byte first; each one
     * governs four consecutive bytes of the row. A strength of zero leaves
     * its four bytes untouched. The clip limit for a byte is the clip table
     * entry indexed by its strength, plus one.
     *
     * Row addresses are formed from the full WORD32 stride: narrowing the
     * stride to 16 bits would truncate strides of 32768 or more, and
     * 2 * src_strd (rather than src_strd << 1) stays defined when the stride
     * is negative.
     */
    UWORD8 *pu1_p1_row = pu1_src - 2 * src_strd;
    UWORD8 *pu1_p0_row = pu1_src - src_strd;
    UWORD8 *pu1_q0_row = pu1_src;
    UWORD8 *pu1_q1_row = pu1_src + src_strd;

    UWORD8 u1_Bs0 = (u4_bs >> 24) & 0xff;
    UWORD8 u1_Bs1 = (u4_bs >> 16) & 0xff;
    UWORD8 u1_Bs2 = (u4_bs >> 8) & 0xff;
    UWORD8 u1_Bs3 = (u4_bs >> 0) & 0xff;

    /* cb threshold in the low 16 bits, cr threshold in the high 16 bits, so */
    /* a 32-bit broadcast yields alternating cb/cr 16-bit lanes.             */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;

    /* Constants shared by the low and high halves */
    __m128i zero = _mm_setzero_si128();
    __m128i all_ones = _mm_set1_epi8(0xFF);
    __m128i round4 = _mm_set1_epi16(4);
    __m128i one = _mm_set1_epi16(1);
    __m128i alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    __m128i beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);

    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag_bs, flag_lo, flag_hi, flag, not_flag;
    __m128i diff, delta, tc;
    __m128i p0_lo, p0_hi, q0_lo, q0_hi;
    __m128i p0_filt, q0_filt, p0_out, q0_out;

    /* 0xFF in every byte whose boundary strength is non-zero */
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero);
    flag_bs = _mm_xor_si128(flag_bs, all_ones);

    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_q0_row));
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_q1_row));
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_p1_row));
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_p0_row));

    /* ---------------- Low 8 bytes (strengths Bs0 and Bs1) ---------------- */
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    /* |p0 - q0| < alpha */
    diff = _mm_abs_epi16(_mm_subs_epi16(p0_uv_8x16, q0_uv_8x16));
    flag_lo = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
    /* |q1 - q0| < beta */
    diff = _mm_abs_epi16(_mm_subs_epi16(q1_uv_8x16, q0_uv_8x16));
    flag_lo = _mm_and_si128(flag_lo, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    /* |p1 - p0| < beta */
    diff = _mm_abs_epi16(_mm_subs_epi16(p1_uv_8x16, p0_uv_8x16));
    flag_lo = _mm_and_si128(flag_lo, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
    delta = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    delta = _mm_slli_epi16(delta, 2);
    delta = _mm_add_epi16(delta, _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16));
    delta = _mm_add_epi16(delta, round4);
    delta = _mm_srai_epi16(delta, 3);

    /* Per-lane clip limit: clip table entry + 1 */
    tc = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                       pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                       pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
                       pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
    tc = _mm_add_epi16(tc, one);

    /* Clamp delta to [-tc, tc] */
    delta = _mm_min_epi16(tc, delta);
    delta = _mm_max_epi16(_mm_subs_epi16(zero, tc), delta);

    p0_lo = _mm_add_epi16(p0_uv_8x16, delta);
    q0_lo = _mm_sub_epi16(q0_uv_8x16, delta);

    /* ---------------- High 8 bytes (strengths Bs2 and Bs3) --------------- */
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

    diff = _mm_abs_epi16(_mm_subs_epi16(p0_uv_8x16, q0_uv_8x16));
    flag_hi = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
    diff = _mm_abs_epi16(_mm_subs_epi16(q1_uv_8x16, q0_uv_8x16));
    flag_hi = _mm_and_si128(flag_hi, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    diff = _mm_abs_epi16(_mm_subs_epi16(p1_uv_8x16, p0_uv_8x16));
    flag_hi = _mm_and_si128(flag_hi, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    delta = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    delta = _mm_slli_epi16(delta, 2);
    delta = _mm_add_epi16(delta, _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16));
    delta = _mm_add_epi16(delta, round4);
    delta = _mm_srai_epi16(delta, 3);

    tc = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                       pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                       pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                       pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
    tc = _mm_add_epi16(tc, one);

    delta = _mm_min_epi16(tc, delta);
    delta = _mm_max_epi16(_mm_subs_epi16(zero, tc), delta);

    p0_hi = _mm_add_epi16(p0_uv_8x16, delta);
    q0_hi = _mm_sub_epi16(q0_uv_8x16, delta);

    /* ------------------------ Merge and write back ----------------------- */
    /* Unsigned-saturating pack clamps the filtered samples to 0..255 */
    p0_filt = _mm_packus_epi16(p0_lo, p0_hi);
    q0_filt = _mm_packus_epi16(q0_lo, q0_hi);

    /* Signed-saturating pack turns the 16-bit 0/-1 lanes into byte masks;   */
    /* a byte is filtered only if all three conditions hold and its bS != 0. */
    flag = _mm_packs_epi16(flag_lo, flag_hi);
    flag = _mm_and_si128(flag, flag_bs);
    not_flag = _mm_xor_si128(flag, all_ones);

    /* The two operands never share a set bit, so the byte add is a select */
    p0_out = _mm_add_epi8(_mm_and_si128(p0_uv_16x8, not_flag),
                          _mm_and_si128(p0_filt, flag));
    _mm_storeu_si128((__m128i *)(pu1_p0_row), p0_out);

    q0_out = _mm_add_epi8(_mm_and_si128(q0_uv_16x8, not_flag),
                          _mm_and_si128(q0_filt, flag));
    _mm_storeu_si128((__m128i *)(pu1_q0_row), q0_out);

}
794 
795 /*****************************************************************************/
796 /*                                                                           */
797 /*  Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3()                */
798 /*                                                                           */
799 /*  Description   : This function performs filtering of a chroma block       */
800 /*                  vertical edge when boundary strength is set to 4 in high */
801 /*                  profile.                                                 */
802 /*                                                                           */
803 /*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
804 /*                  src_strd         - source stride                         */
805 /*                  alpha_cb         - alpha value for the boundary in U     */
806 /*                  beta_cb          - beta value for the boundary in U      */
807 /*                  alpha_cr         - alpha value for the boundary in V     */
808 /*                  beta_cr          - beta value for the boundary in V      */
809 /*                  u4_bs            - packed Boundary strength array        */
810 /*                  pu1_cliptab_cb   - tc0_table for U                       */
811 /*                  pu1_cliptab_cr   - tc0_table for V                       */
812 /*                                                                           */
813 /*  Globals       : None                                                     */
814 /*                                                                           */
815 /*  Processing    : When the function is called twice, this operation is as  */
816 /*                  described in Sec. 8.7.2.4 under the title "Filtering     */
817 /*                  process for edges for bS equal to 4" in ITU T Rec H.264  */
818 /*                  with alpha and beta values different in U and V.         */
819 /*                                                                           */
820 /*  Outputs       : None                                                     */
821 /*                                                                           */
822 /*  Returns       : None                                                     */
823 /*                                                                           */
824 /*  Issues        : None                                                     */
825 /*                                                                           */
826 /*  Revision History:                                                        */
827 /*                                                                           */
828 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
829 /*         12 02 2015   Naveen Kumar P  Initial version                      */
830 /*                                                                           */
831 /*****************************************************************************/
void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
                                             WORD32 src_strd,
                                             WORD32 alpha_cb,
                                             WORD32 beta_cb,
                                             WORD32 alpha_cr,
                                             WORD32 beta_cr)
{
    /*
     * Filters four rows of one vertical chroma edge. For each row, 8 bytes
     * starting at pu1_src - 4 are read: two bytes each for p1, p0, q0 and q1.
     * Even bytes are compared against the cb thresholds and odd bytes against
     * the cr thresholds. This function takes no boundary-strength or clip
     * table arguments; every sample passing the three threshold tests is
     * filtered. All 8 bytes of each row are written back, with p1 and q1
     * unchanged.
     */
    UWORD8 *pu1_row0 = pu1_src - 4;
    UWORD8 *pu1_row1 = pu1_row0 + src_strd;
    UWORD8 *pu1_row2 = pu1_row0 + 2 * src_strd;
    UWORD8 *pu1_row3 = pu1_row0 + 3 * src_strd;

    __m128i zero = _mm_setzero_si128();
    __m128i all_ones = _mm_set1_epi8(0xFF);
    __m128i round2 = _mm_set1_epi16(2);

    /* cb threshold in the low 16 bits, cr in the high 16 bits of each dword */
    __m128i alpha_thr = _mm_set1_epi32((alpha_cr << 16) + alpha_cb);
    __m128i beta_thr = _mm_set1_epi32((beta_cr << 16) + beta_cb);

    /* Load the four rows and transpose so each register holds one tap */
    __m128i row0 = _mm_loadl_epi64((__m128i *)(pu1_row0));
    __m128i row1 = _mm_loadl_epi64((__m128i *)(pu1_row1));
    __m128i row2 = _mm_loadl_epi64((__m128i *)(pu1_row2));
    __m128i row3 = _mm_loadl_epi64((__m128i *)(pu1_row3));

    __m128i rows01 = _mm_unpacklo_epi16(row0, row1);
    __m128i rows23 = _mm_unpacklo_epi16(row2, row3);

    /* Only the low 8 bytes of each tap register are meaningful from here on */
    __m128i p1_px = _mm_unpacklo_epi32(rows01, rows23);
    __m128i p0_px = _mm_srli_si128(p1_px, 8);
    __m128i q0_px = _mm_unpackhi_epi32(rows01, rows23);
    __m128i q1_px = _mm_srli_si128(q0_px, 8);

    /* Widen to 16 bits for the arithmetic */
    __m128i q0 = _mm_unpacklo_epi8(q0_px, zero);
    __m128i q1 = _mm_unpacklo_epi8(q1_px, zero);
    __m128i p1 = _mm_unpacklo_epi8(p1_px, zero);
    __m128i p0 = _mm_unpacklo_epi8(p0_px, zero);

    /* |p0 - q0| < alpha, |q1 - q0| < beta, |p1 - p0| < beta */
    __m128i edge_ok = _mm_cmpgt_epi16(
                    alpha_thr, _mm_abs_epi16(_mm_subs_epi16(p0, q0)));
    __m128i q_side_ok = _mm_cmpgt_epi16(
                    beta_thr, _mm_abs_epi16(_mm_subs_epi16(q1, q0)));
    __m128i p_side_ok = _mm_cmpgt_epi16(
                    beta_thr, _mm_abs_epi16(_mm_subs_epi16(p1, p0)));
    __m128i filt16 = _mm_and_si128(_mm_and_si128(edge_ok, q_side_ok),
                                   p_side_ok);

    /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
    __m128i p0_sum = _mm_add_epi16(
                    _mm_add_epi16(_mm_slli_epi16(p1, 1), round2),
                    _mm_add_epi16(p0, q1));
    __m128i p0_f16 = _mm_srai_epi16(p0_sum, 2);

    /* q0' = (2*q1 + p1 + q0 + 2) >> 2 */
    __m128i q0_sum = _mm_add_epi16(
                    _mm_add_epi16(_mm_slli_epi16(q1, 1), round2),
                    _mm_add_epi16(p1, q0));
    __m128i q0_f16 = _mm_srai_epi16(q0_sum, 2);

    /* Back to bytes; the same data is packed into both halves */
    __m128i p0_f8 = _mm_packus_epi16(p0_f16, p0_f16);
    __m128i q0_f8 = _mm_packus_epi16(q0_f16, q0_f16);
    __m128i filt8 = _mm_packs_epi16(filt16, filt16);
    __m128i keep8 = _mm_xor_si128(filt8, all_ones);

    /* Per-byte select: the two masked operands never share a set bit */
    __m128i p0_new = _mm_add_epi8(_mm_and_si128(p0_px, keep8),
                                  _mm_and_si128(p0_f8, filt8));
    __m128i q0_new = _mm_add_epi8(_mm_and_si128(q0_px, keep8),
                                  _mm_and_si128(q0_f8, filt8));

    /* Inverse transpose back to row order */
    __m128i p_pairs = _mm_unpacklo_epi16(p1_px, p0_new);
    __m128i q_pairs = _mm_unpacklo_epi16(q0_new, q1_px);
    __m128i out01 = _mm_unpacklo_epi32(p_pairs, q_pairs);
    __m128i out23 = _mm_unpackhi_epi32(p_pairs, q_pairs);

    _mm_storel_epi64((__m128i *)(pu1_row0), out01);
    _mm_storel_epi64((__m128i *)(pu1_row1), _mm_srli_si128(out01, 8));
    _mm_storel_epi64((__m128i *)(pu1_row2), out23);
    _mm_storel_epi64((__m128i *)(pu1_row3), _mm_srli_si128(out23, 8));

}
928 
929 /*****************************************************************************/
930 /*                                                                           */
931 /*  Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()              */
932 /*                                                                           */
933 /*  Description   : This function performs filtering of a chroma block       */
934 /*                  vertical edge when boundary strength is less than 4 in   */
935 /*                  high profile.                                            */
936 /*                                                                           */
937 /*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
938 /*                  src_strd         - source stride                         */
939 /*                  alpha_cb         - alpha value for the boundary in U     */
940 /*                  beta_cb          - beta value for the boundary in U      */
941 /*                  alpha_cr         - alpha value for the boundary in V     */
942 /*                  beta_cr          - beta value for the boundary in V      */
943 /*                  u4_bs            - packed Boundary strength array        */
944 /*                  pu1_cliptab_cb   - tc0_table for U                       */
945 /*                  pu1_cliptab_cr   - tc0_table for V                       */
946 /*                                                                           */
947 /*  Globals       : None                                                     */
948 /*                                                                           */
949 /*  Processing    : When the function is called twice, this operation is as  */
/*                  described in Sec. 8.7.2.3 under the title "Filtering     */
951 /*                  process for edges for bS less than 4" in ITU T Rec H.264 */
952 /*                  with alpha and beta values different in U and V.         */
953 /*                                                                           */
954 /*  Outputs       : None                                                     */
955 /*                                                                           */
956 /*  Returns       : None                                                     */
957 /*                                                                           */
958 /*  Issues        : None                                                     */
959 /*                                                                           */
960 /*  Revision History:                                                        */
961 /*                                                                           */
962 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
963 /*         12 02 2015   Naveen Kumar P  Initial version                      */
964 /*                                                                           */
965 /*****************************************************************************/
void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               WORD32 alpha_cb,
                                               WORD32 beta_cb,
                                               WORD32 alpha_cr,
                                               WORD32 beta_cr,
                                               UWORD32 u4_bs,
                                               const UWORD8 *pu1_cliptab_cb,
                                               const UWORD8 *pu1_cliptab_cr)
{
    /*
     * Filters four rows of one vertical chroma edge. For each row, 8 bytes
     * starting at pu1_src - 4 are read: two bytes each for p1, p0, q0 and q1.
     * Even bytes use the cb thresholds/clip table and odd bytes the cr ones.
     * u4_bs packs one strength per row, most significant byte first (row 0).
     * A row whose strength is zero is written back unchanged. The clip limit
     * is the clip table entry indexed by the row's strength, plus one.
     */
    UWORD8 *pu1_row0 = pu1_src - 4;
    UWORD8 *pu1_row1 = pu1_row0 + src_strd;
    UWORD8 *pu1_row2 = pu1_row0 + 2 * src_strd;
    UWORD8 *pu1_row3 = pu1_row0 + 3 * src_strd;

    UWORD8 u1_bs_row0 = (u4_bs >> 24) & 0xff;
    UWORD8 u1_bs_row1 = (u4_bs >> 16) & 0xff;
    UWORD8 u1_bs_row2 = (u4_bs >> 8) & 0xff;
    UWORD8 u1_bs_row3 = (u4_bs >> 0) & 0xff;

    __m128i zero = _mm_setzero_si128();
    __m128i all_ones = _mm_set1_epi8(0xFF);

    /* cb threshold in the low 16 bits, cr in the high 16 bits of each dword */
    __m128i alpha_thr = _mm_set1_epi32((alpha_cr << 16) + alpha_cb);
    __m128i beta_thr = _mm_set1_epi32((beta_cr << 16) + beta_cb);

    /* 0xFF in the two bytes of every row whose strength is non-zero; the   */
    /* unused upper 8 bytes end up as 0x00.                                  */
    __m128i bs_bytes = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
                                    u1_bs_row3, u1_bs_row3,
                                    u1_bs_row2, u1_bs_row2,
                                    u1_bs_row1, u1_bs_row1,
                                    u1_bs_row0, u1_bs_row0);
    __m128i bs_nonzero = _mm_xor_si128(_mm_cmpeq_epi8(bs_bytes, zero),
                                       all_ones);

    /* Load the four rows and transpose so each register holds one tap */
    __m128i row0 = _mm_loadl_epi64((__m128i *)(pu1_row0));
    __m128i row1 = _mm_loadl_epi64((__m128i *)(pu1_row1));
    __m128i row2 = _mm_loadl_epi64((__m128i *)(pu1_row2));
    __m128i row3 = _mm_loadl_epi64((__m128i *)(pu1_row3));

    __m128i rows01 = _mm_unpacklo_epi16(row0, row1);
    __m128i rows23 = _mm_unpacklo_epi16(row2, row3);

    /* Only the low 8 bytes of each tap register are meaningful from here on */
    __m128i p1_px = _mm_unpacklo_epi32(rows01, rows23);
    __m128i p0_px = _mm_srli_si128(p1_px, 8);
    __m128i q0_px = _mm_unpackhi_epi32(rows01, rows23);
    __m128i q1_px = _mm_srli_si128(q0_px, 8);

    /* Widen to 16 bits for the arithmetic */
    __m128i q0 = _mm_unpacklo_epi8(q0_px, zero);
    __m128i q1 = _mm_unpacklo_epi8(q1_px, zero);
    __m128i p1 = _mm_unpacklo_epi8(p1_px, zero);
    __m128i p0 = _mm_unpacklo_epi8(p0_px, zero);

    /* |p0 - q0| < alpha, |q1 - q0| < beta, |p1 - p0| < beta */
    __m128i edge_ok = _mm_cmpgt_epi16(
                    alpha_thr, _mm_abs_epi16(_mm_subs_epi16(p0, q0)));
    __m128i q_side_ok = _mm_cmpgt_epi16(
                    beta_thr, _mm_abs_epi16(_mm_subs_epi16(q1, q0)));
    __m128i p_side_ok = _mm_cmpgt_epi16(
                    beta_thr, _mm_abs_epi16(_mm_subs_epi16(p1, p0)));
    __m128i filt16 = _mm_and_si128(_mm_and_si128(edge_ok, q_side_ok),
                                   p_side_ok);

    /* raw delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
    __m128i edge_step = _mm_slli_epi16(_mm_subs_epi16(q0, p0), 2);
    __m128i outer_step = _mm_subs_epi16(p1, q1);
    __m128i delta_raw = _mm_srai_epi16(
                    _mm_add_epi16(_mm_add_epi16(edge_step, outer_step),
                                  _mm_set1_epi16(4)),
                    3);

    /* Per-lane clip limit: clip table entry + 1, one cb/cr pair per row */
    __m128i tc_tab = _mm_set_epi16(
                    pu1_cliptab_cr[u1_bs_row3], pu1_cliptab_cb[u1_bs_row3],
                    pu1_cliptab_cr[u1_bs_row2], pu1_cliptab_cb[u1_bs_row2],
                    pu1_cliptab_cr[u1_bs_row1], pu1_cliptab_cb[u1_bs_row1],
                    pu1_cliptab_cr[u1_bs_row0], pu1_cliptab_cb[u1_bs_row0]);
    __m128i tc = _mm_add_epi16(tc_tab, _mm_set1_epi16(1));
    __m128i neg_tc = _mm_subs_epi16(zero, tc);

    /* Clamp delta to [-tc, tc] */
    __m128i delta = _mm_max_epi16(neg_tc, _mm_min_epi16(tc, delta_raw));

    __m128i p0_f16 = _mm_add_epi16(p0, delta);
    __m128i q0_f16 = _mm_sub_epi16(q0, delta);

    /* Back to bytes (clamped to 0..255); same data in both halves */
    __m128i p0_f8 = _mm_packus_epi16(p0_f16, p0_f16);
    __m128i q0_f8 = _mm_packus_epi16(q0_f16, q0_f16);

    /* Filter a byte only if all three tests pass and its row has bS != 0 */
    __m128i filt8 = _mm_and_si128(_mm_packs_epi16(filt16, filt16),
                                  bs_nonzero);
    __m128i keep8 = _mm_xor_si128(filt8, all_ones);

    /* Per-byte select: the two masked operands never share a set bit */
    __m128i p0_new = _mm_add_epi8(_mm_and_si128(p0_px, keep8),
                                  _mm_and_si128(p0_f8, filt8));
    __m128i q0_new = _mm_add_epi8(_mm_and_si128(q0_px, keep8),
                                  _mm_and_si128(q0_f8, filt8));

    /* Inverse transpose back to row order */
    __m128i p_pairs = _mm_unpacklo_epi16(p1_px, p0_new);
    __m128i q_pairs = _mm_unpacklo_epi16(q0_new, q1_px);
    __m128i out01 = _mm_unpacklo_epi32(p_pairs, q_pairs);
    __m128i out23 = _mm_unpackhi_epi32(p_pairs, q_pairs);

    _mm_storel_epi64((__m128i *)(pu1_row0), out01);
    _mm_storel_epi64((__m128i *)(pu1_row1), _mm_srli_si128(out01, 8));
    _mm_storel_epi64((__m128i *)(pu1_row2), out23);
    _mm_storel_epi64((__m128i *)(pu1_row3), _mm_srli_si128(out23, 8));

}
1087 
1088