1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 *  ihevc_deblck_atom_intr.c
22 *
23 * @brief
24 *  Contains function definitions for deblocking filters
25 *
26 * @author
27 *  Rishab
28 *
29 * @par List of Functions:
30 *   - ihevc_deblk_luma_vert_ssse3()
31 *   - ihevc_deblk_luma_horz_ssse3()
32 *   - ihevc_deblk_chroma_vert_ssse3()
33 *   - ihevc_deblk_chroma_horz_ssse3()
34 *
35 * @remarks
36 *  None
37 *
38 *******************************************************************************
39 */
40 #include <stdlib.h>
41 #include <stdio.h>
42 #include <assert.h>
43 #include "ihevc_typedefs.h"
44 #include "ihevc_platform_macros.h"
45 #include "ihevc_macros.h"
46 #include "ihevc_deblk.h"
47 #include "ihevc_deblk_tables.h"
48 #include "ihevc_debug.h"
49 
50 #include "ihevc_tables_x86_intr.h"
51 
52 #include <immintrin.h>
53 /**
54 *******************************************************************************
55 *
56 * @brief
57 *       Decision process and filtering for the luma block vertical edge.
58 *
59 * @par Description:
60 *     The decision process for the luma block vertical edge is  carried out and
61 *     an appropriate filter is applied. The  boundary filter strength, bs should
62 *     be greater than 0.  The pcm flags and the transquant bypass flags should
63 *     be  taken care of by the calling function.
64 *
65 * @param[in] pu1_src
66 *  Pointer to the src sample q(0,0)
67 *
68 * @param[in] src_strd
69 *  Source stride
70 *
71 * @param[in] bs
72 *  Boundary filter strength of q(0,0)
73 *
74 * @param[in] quant_param_p
75 *  quantization parameter of p block
76 *
77 * @param[in] quant_param_q
78 *  quantization parameter of p block
79 *
80 * @param[in] beta_offset_div2
81 *
82 *
83 * @param[in] tc_offset_div2
84 *
85 *
86 * @param[in] filter_flag_p
87 *  flag whether to filter the p block
88 *
89 * @param[in] filter_flag_q
90 *  flag whether to filter the q block
91 *
92 * @returns
93 *
94 * @remarks
95 *  None
96 *
97 *******************************************************************************
98 */
99 
ihevc_deblk_luma_vert_ssse3(UWORD8 * pu1_src,WORD32 src_strd,WORD32 bs,WORD32 quant_param_p,WORD32 quant_param_q,WORD32 beta_offset_div2,WORD32 tc_offset_div2,WORD32 filter_flag_p,WORD32 filter_flag_q)100 void ihevc_deblk_luma_vert_ssse3(UWORD8 *pu1_src,
101                                  WORD32 src_strd,
102                                  WORD32 bs,
103                                  WORD32 quant_param_p,
104                                  WORD32 quant_param_q,
105                                  WORD32 beta_offset_div2,
106                                  WORD32 tc_offset_div2,
107                                  WORD32 filter_flag_p,
108                                  WORD32 filter_flag_q)
109 {
110     WORD32 qp_luma, beta_indx, tc_indx;
111     WORD32 beta, tc;
112     WORD32 d, dp, dq, d_sam0, d_sam3;
113 
114     WORD32 d3, d0, de_0, de_1, de_2, de_3;
115     WORD32 de, dep, deq;
116     __m128i src_row0_8x16b, src_row1_8x16b, src_row2_8x16b, src_row3_8x16b;
117 
118 
119     {
120         __m128i src_tmp_8x16b, coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
121         __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;
122 
123 
124 
125         ASSERT((bs > 0) && (bs <= 3));
126         ASSERT(filter_flag_p || filter_flag_q);
127 
128         qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
129         beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
130 
131         /* BS based on implementation can take value 3 if it is intra/inter egde          */
132         /* based on BS, tc index is calcuated by adding 2 * ( bs - 1) to QP and tc_offset */
133         /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2)          */
134         /* the above desired functionallity is achieved by doing (2*(bs>>1))              */
135 
136         tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);
137 
138         beta = gai4_ihevc_beta_table[beta_indx];
139         tc = gai4_ihevc_tc_table[tc_indx];
140         if(0 == tc)
141         {
142             return;
143         }
144         src_row0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
145         src_row3_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd));
146 
147         coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
148         mask_16x8b =  _mm_load_si128((__m128i *)(shuffle_d));
149 
150         src_tmp_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row3_8x16b);
151         mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_8x16b, mask_16x8b);
152 
153         mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_8x16b, coef_8x16b);
154 
155 
156         //to get all 1's of 8 bit in (1)
157         temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_8x16b, src_tmp_8x16b);
158         temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
159         //accumulating values foe dp3 dq3 , dp0 dq0 values
160         mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);
161 
162         temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
163         // to get all 1,-1 sets of 16 bits in (0)
164         temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
165         //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
166         mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
167         //to get 16 bit 1's
168         temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);
169 
170 
171         // dq3 dp3 dq0 dp0
172         mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
173         mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
174         mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
175         // dq dp d3 d0
176         mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
177         //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
178         mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
179         //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
180         mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
181 
182         ///store back in a single variable
183         temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
184         temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
185         mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);
186 
187         d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
188         d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
189         dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
190         dq = _mm_cvtsi128_si32(mask_16x8b);
191         //getting d
192         d = d0 + d3;
193 
194         ///store back in a single variable
195         temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
196         temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
197         mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);
198 
199         de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
200         de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
201         de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
202         de_3 = _mm_cvtsi128_si32(mask_16x8b);
203 
204         de = 0;
205         dep = 0;
206         deq = 0;
207         if(d < beta)
208         {
209             d_sam0 = 0;
210             if((2 * d0 < (beta >> 2))
211                             && (de_2 < (beta >> 3))
212                             && (de_0 < ((5 * tc + 1) >> 1)))
213             {
214                 d_sam0 = 1;
215             }
216 
217             d_sam3 = 0;
218             if((2 * d3 < (beta >> 2))
219                             && (de_3 < (beta >> 3))
220                             && de_1 < ((5 * tc + 1) >> 1))
221             {
222                 d_sam3 = 1;
223             }
224 
225             de = (d_sam0 & d_sam3) + 1;
226             dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
227             deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
228             if(tc <= 1)
229             {
230                 dep = 0;
231                 deq = 0;
232             }
233         }
234 
235     }
236 
237     if(de != 0)
238     {
239 
240 
241         src_row1_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + src_strd));
242         src_row2_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd));
243 
244         if(de == 2)
245         {
246             __m128i temp_pq_str0_16x8b;
247             __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
248             __m128i temp_pq2_str0_16x8b;
249             __m128i temp_pq_str1_16x8b;
250             __m128i temp_str0_16x8b, temp_str1_16x8b, temp_str2_16x8b, temp_str3_16x8b;
251             __m128i temp_max0_16x8b, temp_max1_16x8b, temp_min0_16x8b, temp_min1_16x8b;
252             __m128i const2_8x16b, const2tc_8x16b;
253             LWORD64 mask, tc2;
254             tc = tc << 1;
255             mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
256             tc2 = ((LWORD64)tc);
257 
258             const2_8x16b = _mm_cmpeq_epi16(src_row0_8x16b, src_row0_8x16b);
259             //q'0-q'1-2 ,p'0-p'1-2
260             src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row2_8x16b);
261             src_row1_8x16b = _mm_unpacklo_epi64(src_row1_8x16b, src_row3_8x16b);
262 
263             const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
264             temp_pq_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 16);
265             temp_pq_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 16);
266             //arranged x x x x x x x x q31 q30 q1 q10 p30 p31 p10 p11 , x x x x x x x x q21 q20 q01 q00 p20 p21 p00 p01
267             temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
268             temp_str1_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
269 
270             const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
271             //arranged q31 q30 q21 q20 q1 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
272             temp_pq_str0_16x8b = _mm_unpacklo_epi32(temp_str0_16x8b, temp_str1_16x8b);
273 
274             temp_pq_str0_16x8b = _mm_maddubs_epi16(temp_pq_str0_16x8b, const2_8x16b);
275 
276             //q'1-2, p'1-2
277             temp_pq1_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 8);
278             temp_pq1_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 8);
279 
280             temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
281             temp_str3_16x8b = _mm_unpackhi_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
282 
283             temp_str2_16x8b = _mm_shuffle_epi32(temp_str2_16x8b, 0x58);
284             temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x58);
285             // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
286             temp_pq1_str0_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str3_16x8b);
287             // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
288             temp_pq1_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str3_16x8b);
289 
290             temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
291             temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);
292 
293             //clipping mask design
294             temp_str1_16x8b = _mm_setzero_si128();
295             temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
296             const2tc_8x16b  = _mm_loadl_epi64((__m128i *)(&tc2));
297             temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
298             const2tc_8x16b  = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);
299 
300             //clipping mask design
301             temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
302             const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
303             //calculating Clipping MAX for all pixel values.
304             temp_max0_16x8b = _mm_adds_epu8(src_row0_8x16b, const2tc_8x16b);
305             temp_max1_16x8b = _mm_adds_epu8(src_row1_8x16b, const2tc_8x16b);
306 
307 
308             //q'2-q'0-2,p'2-p'0-2
309             temp_pq2_str0_16x8b = _mm_unpacklo_epi16(src_row0_8x16b, src_row2_8x16b);
310             temp_str3_16x8b     = _mm_unpacklo_epi16(src_row1_8x16b, src_row3_8x16b);
311 
312             temp_pq2_str0_16x8b = _mm_shuffle_epi32(temp_pq2_str0_16x8b, 0x5c);
313             temp_str3_16x8b     = _mm_shuffle_epi32(temp_str3_16x8b, 0x5c);
314 
315             const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
316             //arranged q33 q32 q23 q22 q13 q12 q03 q02 p33 p32 p23 p22 p13 p12 p03 p02
317             temp_str3_16x8b = _mm_unpacklo_epi16(temp_pq2_str0_16x8b, temp_str3_16x8b);
318 
319             temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_str3_16x8b, const2_8x16b);
320 
321             //calculating Clipping MIN for all pixel values.
322             temp_min0_16x8b = _mm_subs_epu8(src_row0_8x16b, const2tc_8x16b);
323             temp_min1_16x8b = _mm_subs_epu8(src_row1_8x16b, const2tc_8x16b);
324             //q'0-q'1-2 ,p'0-p'1-2
325             temp_pq_str1_16x8b = _mm_shuffle_epi32(temp_pq_str0_16x8b, 0x4e);
326             temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
327             //q'1-2 p'1-2
328             temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
329             //to get 2 in 16 bit
330             const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
331             //to get q33 q23 q13 q03, p33 p23 p13 p03
332             temp_pq1_str1_16x8b = _mm_slli_epi16(temp_str3_16x8b, 8);
333             temp_pq_str1_16x8b = _mm_srli_epi16(temp_str3_16x8b, 8);
334             temp_pq1_str1_16x8b = _mm_srli_epi16(temp_pq1_str1_16x8b, 8);
335 
336             //q'1, p'1 (adding 2)
337             temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
338             //q'0-q'1,p'0-p'1
339             temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, const2_8x16b);
340             //q'2-q'1,p'2-p'1
341             temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
342             //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
343             temp_pq_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
344             //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
345             temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);
346 
347             //normalisation of all modified pixels
348             temp_pq_str0_16x8b  = _mm_srai_epi16(temp_pq_str0_16x8b, 3);
349             temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
350             temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
351 
352             //getting p0 p1 together and p2 p3 together
353             temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
354             temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str1_16x8b, temp_pq2_str0_16x8b);
355             //getting q1 q0 together and  q3 q2 together
356             temp_pq_str0_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq1_str0_16x8b);
357             temp_pq2_str0_16x8b = _mm_unpackhi_epi16(temp_pq2_str0_16x8b, temp_pq_str1_16x8b);
358             //getting p's of row0 row1 together and of row2 row3 together
359             temp_pq_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str0_16x8b);
360             temp_str2_16x8b    = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str0_16x8b);
361             //getting q's of row0 row1 together and of row2 row3 together
362             temp_str0_16x8b    = _mm_unpacklo_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
363             temp_pq_str0_16x8b = _mm_unpackhi_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
364             //getting values for respective rows in 16 bit
365             src_row0_8x16b = _mm_unpacklo_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
366             src_row1_8x16b = _mm_unpackhi_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
367             src_row2_8x16b = _mm_unpacklo_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
368             src_row3_8x16b = _mm_unpackhi_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
369             //packing values to 8 bit
370             src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row2_8x16b);
371             src_row1_8x16b = _mm_packus_epi16(src_row1_8x16b, src_row3_8x16b);
372             //Clipping MAX
373             src_row0_8x16b = _mm_min_epu8(src_row0_8x16b, temp_max0_16x8b);
374             src_row1_8x16b = _mm_min_epu8(src_row1_8x16b, temp_max1_16x8b);
375             //Clipping MIN
376             src_row0_8x16b = _mm_max_epu8(src_row0_8x16b, temp_min0_16x8b);
377             src_row1_8x16b = _mm_max_epu8(src_row1_8x16b, temp_min1_16x8b);
378             //separating row 2 and row 3
379             src_row2_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
380             src_row3_8x16b = _mm_srli_si128(src_row1_8x16b, 8);
381 
382         }
383 
384         else
385         {
386 
387             __m128i tmp_delta0_8x16b, tmp_delta1_8x16b, tmp_delta2_8x16b, tmp_delta3_8x16b;
388             __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b, tmp3_const_8x16b;
389             __m128i coefdelta_0_8x16b, mask_pq_8x16b;
390             __m128i const2_8x16b, consttc_8x16b;
391 
392             LWORD64 mask1;
393             mask1 = (((LWORD64)(filter_flag_q & deq)) << 63) | (((LWORD64)filter_flag_q) << 47) | (((LWORD64)filter_flag_p) << 31) | (((LWORD64)(filter_flag_p & dep)) << 15);
394 
395             consttc_8x16b = _mm_set1_epi32(tc);
396 
397 
398             src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row1_8x16b);
399             src_row2_8x16b = _mm_unpacklo_epi64(src_row2_8x16b, src_row3_8x16b);
400 
401             tmp_delta2_8x16b = _mm_srli_epi64(src_row0_8x16b, 16);
402             tmp_delta3_8x16b = _mm_srli_epi64(src_row2_8x16b, 16);
403 
404             tmp_delta2_8x16b = _mm_shuffle_epi32(tmp_delta2_8x16b, 0x08);
405             tmp_delta3_8x16b = _mm_shuffle_epi32(tmp_delta3_8x16b, 0x08);
406             //arranged q31 q30 p30 p31  q21 q20 p20 p21  q1 q10 p10 p11 q01 q00 p00 p01
407             tmp_delta2_8x16b = _mm_unpacklo_epi64(tmp_delta2_8x16b, tmp_delta3_8x16b);
408 
409             coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
410             // (-3q1+9q0),(-9p0+3p1)
411             tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
412             //converting to 16 bit
413             consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
414             //getting -tc store
415             tmp1_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
416             //calc 10 *tc = 2*tc +8*tc ; 2*tc
417             tmp2_const_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
418             //calc 10 *tc = 2*tc +8*tc ; 8*tc
419             tmp0_const_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
420             //getting -tc store
421             tmp3_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
422             //calc 10 *tc
423             tmp2_const_8x16b = _mm_add_epi16(tmp2_const_8x16b, tmp0_const_8x16b);
424             //const 1
425             const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
426             tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta3_8x16b, const2_8x16b);
427             const2_8x16b = _mm_srli_epi32(tmp1_const_8x16b, 31);
428             //getting the mask values
429             mask_pq_8x16b = _mm_loadl_epi64((__m128i *)(&mask1));
430             //loaded coef for delta1 calculation
431             coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
432             //(-2q1+q0),(p0-2p1)
433             tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
434             //const 8
435             const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
436             //rearranging the mask values
437             mask_pq_8x16b = _mm_unpacklo_epi64(mask_pq_8x16b, mask_pq_8x16b);
438             //normalisation of the filter
439             tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
440             tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);
441 
442             //getting deltaq0
443             tmp_delta2_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp1_const_8x16b);
444             //packing  d3q d2q d1q d0q d3p d2p d1p d0p
445             tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_delta2_8x16b);
446             //absolute delta
447             tmp_delta2_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
448             //Clipping of delta0
449             tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
450             //mask for |delta| < 10*tc
451             tmp0_const_8x16b = _mm_cmpgt_epi16(tmp2_const_8x16b, tmp_delta2_8x16b);
452             //Clipping of delta0
453             tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp3_const_8x16b);
454 
455 
456             //delta 1 calc starts
457 
458             //getting q32 q22 q12 q02 p32 p12 p22 p02
459             tmp2_const_8x16b = _mm_loadl_epi64((__m128i *)(shuffle0));
460             tmp_delta2_8x16b = _mm_shuffle_epi8(src_row0_8x16b, tmp2_const_8x16b);
461             tmp_delta1_8x16b =  _mm_shuffle_epi8(src_row2_8x16b, tmp2_const_8x16b);
462             tmp_delta1_8x16b = _mm_unpacklo_epi32(tmp_delta2_8x16b, tmp_delta1_8x16b);
463             //constant 1
464             const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
465             //tc>>1 16 bit
466             consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
467 
468             //getting -tc>>1 store  16 bit
469             tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
470             //2*delta0
471             tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
472 
473             //getting  all respective q's and p's together
474             tmp3_const_8x16b = _mm_load_si128((__m128i *)(shuffle1));
475             tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta3_8x16b, tmp3_const_8x16b);
476             //final adds for deltap1 and deltaq1
477             tmp_delta3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, const2_8x16b);
478             tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp2_const_8x16b);
479             tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp_delta3_8x16b);
480             tmp2_const_8x16b = _mm_setzero_si128();
481             tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);
482 
483             // clipping delta1
484             tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
485             // clipping delta1
486             tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);
487 
488             //getting the mask ready
489             mask_pq_8x16b = _mm_srai_epi16(mask_pq_8x16b, 15);
490             //masking of the delta values |delta|<10*tc
491             tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp0_const_8x16b);
492             tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp0_const_8x16b);
493             //packing dq1 dq0 dp0 dp1
494             tmp1_const_8x16b = _mm_unpacklo_epi16(tmp_delta1_8x16b, tmp_delta0_8x16b);
495             tmp_delta0_8x16b = _mm_unpackhi_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
496             tmp_delta1_8x16b = _mm_unpackhi_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
497             tmp_delta0_8x16b = _mm_unpacklo_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
498 
499             //masking of the delta values dep, deq , filter_p ,filter_q
500             tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, mask_pq_8x16b);
501             tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, mask_pq_8x16b);
502             //converting 8bit to 16 bit
503             src_row0_8x16b = _mm_unpacklo_epi8(src_row0_8x16b, tmp2_const_8x16b);
504             src_row1_8x16b = _mm_unpacklo_epi8(src_row1_8x16b, tmp2_const_8x16b);
505             src_row2_8x16b = _mm_unpacklo_epi8(src_row2_8x16b, tmp2_const_8x16b);
506             src_row3_8x16b = _mm_unpacklo_epi8(src_row3_8x16b, tmp2_const_8x16b);
507             //shuffle values loaded
508             tmp0_const_8x16b = _mm_load_si128((__m128i *)shuffle2);
509             tmp1_const_8x16b = _mm_load_si128((__m128i *)shuffle3);
510             //arranging each row delta in different registers
511             tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp1_const_8x16b);
512             tmp_delta2_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp0_const_8x16b);
513             tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp1_const_8x16b);
514             tmp_delta0_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp0_const_8x16b);
515 
516             //adding the respective delta
517             src_row3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, src_row3_8x16b);
518             src_row2_8x16b = _mm_add_epi16(tmp_delta2_8x16b, src_row2_8x16b);
519             src_row1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_row1_8x16b);
520             src_row0_8x16b = _mm_add_epi16(tmp_delta0_8x16b, src_row0_8x16b);
521             //saturating to 8 bit
522             src_row2_8x16b = _mm_packus_epi16(src_row2_8x16b, src_row3_8x16b);
523             src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row1_8x16b);
524             //separating different rows
525             src_row1_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
526             src_row3_8x16b = _mm_srli_si128(src_row2_8x16b, 8);
527         }
528 
529         _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row0_8x16b);
530         _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), src_row1_8x16b);
531         _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row2_8x16b);
532         _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), src_row3_8x16b);
533     }
534 }
535 
ihevc_deblk_luma_horz_ssse3(UWORD8 * pu1_src,WORD32 src_strd,WORD32 bs,WORD32 quant_param_p,WORD32 quant_param_q,WORD32 beta_offset_div2,WORD32 tc_offset_div2,WORD32 filter_flag_p,WORD32 filter_flag_q)536 void ihevc_deblk_luma_horz_ssse3(UWORD8 *pu1_src,
537                                  WORD32 src_strd,
538                                  WORD32 bs,
539                                  WORD32 quant_param_p,
540                                  WORD32 quant_param_q,
541                                  WORD32 beta_offset_div2,
542                                  WORD32 tc_offset_div2,
543                                  WORD32 filter_flag_p,
544                                  WORD32 filter_flag_q)
545 {
546     WORD32 qp_luma, beta_indx, tc_indx;
547     WORD32 beta, tc;
548 
549     WORD32 d0, d3, dp, dq, d;
550     WORD32 de_0, de_1, de_2, de_3;
551     WORD32 d_sam0, d_sam3;
552     WORD32 de, dep, deq;
553 
554     __m128i src_q0_8x16b, src_q1_8x16b, src_p0_8x16b, src_p1_8x16b, src_q2_8x16b;
555     __m128i tmp_pq_str1_8x16b, src_p2_8x16b, tmp_pq_str0_8x16b;
556 
557 
558 
559 
560     {
561         __m128i src_tmp_p_0_8x16b, src_tmp_p_1_8x16b, src_tmp_q_0_8x16b, src_tmp_q_1_8x16b;
562         __m128i coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
563         __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;
564 
565         ASSERT((bs > 0));
566         ASSERT(filter_flag_p || filter_flag_q);
567 
568         qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
569         beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
570 
571         /* BS based on implementation can take value 3 if it is intra/inter egde          */
572         /* based on BS, tc index is calcuated by adding 2 * ( bs - 1) to QP and tc_offset */
573         /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2)          */
574         /* the above desired functionallity is achieved by doing (2*(bs>>1))              */
575 
576         tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);
577 
578         beta = gai4_ihevc_beta_table[beta_indx];
579         tc = gai4_ihevc_tc_table[tc_indx];
580         if(0 == tc)
581         {
582             return;
583         }
584         src_q0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src));
585         src_q1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
586         src_p0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
587         src_p1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
588         src_q2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd));
589         tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd));
590         src_p2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 3 * src_strd));
591         tmp_pq_str0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4 * src_strd));
592 
593 
594         src_tmp_p_0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
595         src_tmp_p_1_8x16b = _mm_unpacklo_epi8(tmp_pq_str0_8x16b, src_p2_8x16b);
596 
597         src_tmp_q_0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
598         src_tmp_q_1_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
599 
600         src_tmp_p_0_8x16b = _mm_unpacklo_epi16(src_tmp_p_1_8x16b, src_tmp_p_0_8x16b);
601         src_tmp_q_0_8x16b = _mm_unpacklo_epi16(src_tmp_q_0_8x16b, src_tmp_q_1_8x16b);
602 
603         src_tmp_p_0_8x16b = _mm_shuffle_epi32(src_tmp_p_0_8x16b, 0x6c);
604         src_tmp_q_0_8x16b = _mm_shuffle_epi32(src_tmp_q_0_8x16b, 0x6c);
605 
606         coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
607         mask_16x8b =  _mm_load_si128((__m128i *)(shuffle_d));
608 
609         src_tmp_p_0_8x16b = _mm_unpacklo_epi32(src_tmp_p_0_8x16b, src_tmp_q_0_8x16b);
610         //WORD32 shuffle_d[4]={0x80800403,0x80800c0b,0x03000704,0x0b080f0c};
611         mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_p_0_8x16b, mask_16x8b);
612 
613         mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_p_0_8x16b, coef_8x16b);
614 
615 
616         //to get all 1's of 8 bit in (1)
617         temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_p_0_8x16b, src_tmp_p_0_8x16b);
618         temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
619         //accumulating values foe dp3 dq3 , dp0 dq0 values
620         mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);
621 
622         temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
623         // to get all 1,-1 sets of 16 bits in (0)
624         temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
625         //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
626         mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
627         //to get 16 bit 1's
628         temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);
629 
630 
631         // dq3 dp3 dq0 dp0
632         mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
633         mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
634         mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
635         // dq dp d3 d0
636         mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
637         //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
638         mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
639         //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
640         mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
641 
642         ///store back in a single variable
643         temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
644         temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
645         mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);
646 
647         d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
648         d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
649         dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
650         dq = _mm_cvtsi128_si32(mask_16x8b);
651         //getting d
652         d = d0 + d3;
653 
654         ///store back in a single variable
655         temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
656         temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
657         mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);
658 
659         de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
660         de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
661         de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
662         de_3 = _mm_cvtsi128_si32(mask_16x8b);
663 
664         de = 0;
665         dep = 0;
666         deq = 0;
667         if(d < beta)
668         {
669             d_sam0 = 0;
670             if((2 * d0 < (beta >> 2))
671                             && (de_2 < (beta >> 3))
672                             && (de_0 < ((5 * tc + 1) >> 1)))
673             {
674                 d_sam0 = 1;
675             }
676 
677             d_sam3 = 0;
678             if((2 * d3 < (beta >> 2))
679                             && (de_3 < (beta >> 3))
680                             && de_1 < ((5 * tc + 1) >> 1))
681             {
682                 d_sam3 = 1;
683             }
684 
685             de = (d_sam0 & d_sam3) + 1;
686             dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
687             deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
688             if(tc <= 1)
689             {
690                 dep = 0;
691                 deq = 0;
692             }
693         }
694 
695     }
696 
697     if(de != 0)
698     {
699 
700         if(2 == de)
701         {
702 
703             __m128i temp_pq0_str0_16x8b;
704             __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
705             __m128i temp_pq2_str0_16x8b;
706             __m128i temp_str0_16x8b, temp_str1_16x8b;
707             __m128i const2_8x16b, const2tc_8x16b;
708 
709             LWORD64 mask, tc2;
710             tc = tc << 1;
711             mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
712             tc2 = ((LWORD64)tc);
713 
714             const2_8x16b = _mm_cmpeq_epi16(src_p1_8x16b, src_p1_8x16b);
715             //q'0-q'1-2 ,p'0-p'1-2
716             temp_pq0_str0_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
717             temp_str0_16x8b   = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
718             const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
719             //arranged q31 q30 q21 q20 q1 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
720             temp_pq0_str0_16x8b = _mm_unpacklo_epi64(temp_pq0_str0_16x8b, temp_str0_16x8b);
721 
722             const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
723             temp_pq0_str0_16x8b = _mm_maddubs_epi16(temp_pq0_str0_16x8b, const2_8x16b);
724 
725             //q'1-2, p'1-2
726             temp_pq1_str0_16x8b = _mm_unpacklo_epi8(src_p0_8x16b, src_q0_8x16b);
727             temp_pq1_str1_16x8b = _mm_unpacklo_epi8(src_q1_8x16b, src_q2_8x16b);
728             temp_str1_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p2_8x16b);
729             // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
730             temp_pq1_str0_16x8b = _mm_unpacklo_epi64(temp_pq1_str0_16x8b, temp_pq1_str0_16x8b);
731             // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
732             temp_pq1_str1_16x8b = _mm_unpacklo_epi64(temp_str1_16x8b, temp_pq1_str1_16x8b);
733 
734             temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
735             temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);
736 
737             //clipping mask design
738             temp_str1_16x8b = _mm_setzero_si128();
739             temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
740             const2tc_8x16b  = _mm_loadl_epi64((__m128i *)(&tc2));
741             temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
742             const2tc_8x16b  = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);
743 
744             //clipping mask design
745             temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
746             const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
747             //calculating Clipping MAX for all pixel values.
748             src_p0_8x16b = _mm_unpacklo_epi32(src_p0_8x16b, src_q0_8x16b);
749             src_q0_8x16b = _mm_unpacklo_epi32(src_p1_8x16b, src_q1_8x16b);
750             //for clipping calc
751             src_p1_8x16b = _mm_unpacklo_epi64(src_p0_8x16b, src_q0_8x16b);
752             //saving the unmodified data of q1 p1 q0 p0
753             src_q1_8x16b = _mm_unpackhi_epi64(src_p0_8x16b, src_q0_8x16b);
754             //CLIpping MAX and MIN for q1 p1 q0 p0
755             src_p0_8x16b = _mm_adds_epu8(src_p1_8x16b, const2tc_8x16b);
756             src_p1_8x16b = _mm_subs_epu8(src_p1_8x16b, const2tc_8x16b);
757 
758 
759             //q'2-q'0-2,p'2-p'0-2
760             tmp_pq_str0_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp_pq_str0_8x16b);
761             temp_pq2_str0_16x8b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
762             const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
763             //arranged q33 q32 q23 q22 q13 q12 q03 q02 p32 p33 p22 p23 p12 p13 p02 p03
764             temp_pq2_str0_16x8b = _mm_unpacklo_epi64(tmp_pq_str0_8x16b, temp_pq2_str0_16x8b);
765             src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, src_q2_8x16b);
766             temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_pq2_str0_16x8b, const2_8x16b);
767 
768             //calculating Clipping MAX and MIN for p2 and q2 .
769             tmp_pq_str0_8x16b = _mm_adds_epu8(src_p2_8x16b, const2tc_8x16b);
770             tmp_pq_str1_8x16b = _mm_subs_epu8(src_p2_8x16b, const2tc_8x16b);
771             //q'0-q'1-2 ,p'0-p'1-2
772             temp_str0_16x8b = _mm_shuffle_epi32(temp_pq0_str0_16x8b, 0x4e);
773             temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, temp_str0_16x8b);
774             //q'1-2 p'1-2
775             temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
776             //to get 2 in 16 bit
777             const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
778 
779 
780             //q'1, p'1 (adding 2)
781             temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
782             //q'0-q'1,p'0-p'1
783             temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, const2_8x16b);
784             //q'2-q'1,p'2-p'1
785             temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
786             //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
787             temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq0_str0_16x8b);
788             //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
789             temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);
790 
791             //normalisation of all modified pixels
792             temp_pq0_str0_16x8b = _mm_srai_epi16(temp_pq0_str0_16x8b, 3);
793             temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
794             temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
795             //q'1 p'1 q'0 p'0
796             temp_pq0_str0_16x8b = _mm_packus_epi16(temp_pq0_str0_16x8b, temp_pq1_str0_16x8b);
797             temp_pq2_str0_16x8b = _mm_packus_epi16(temp_pq2_str0_16x8b, temp_pq2_str0_16x8b);
798             //pack with the unmodified data of q2 and p2
799             src_p2_8x16b = _mm_unpackhi_epi64(temp_pq2_str0_16x8b, src_p2_8x16b);
800             //Clipping MAX and MIN for q'1 p'1 q'0 p'0 and q'2  p'2
801             temp_pq0_str0_16x8b = _mm_min_epu8(temp_pq0_str0_16x8b, src_p0_8x16b);
802             src_p2_8x16b = _mm_min_epu8(src_p2_8x16b, tmp_pq_str0_8x16b);
803             temp_pq0_str0_16x8b = _mm_max_epu8(temp_pq0_str0_16x8b, src_p1_8x16b);
804             src_p2_8x16b = _mm_max_epu8(src_p2_8x16b, tmp_pq_str1_8x16b);
805             //Reshuffling q'1 p'1 q'0 p'0 along with unmodified data
806             src_p0_8x16b = _mm_unpacklo_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
807             src_p1_8x16b = _mm_unpackhi_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
808             src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0xd8);
809             src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
810             src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
811             src_q2_8x16b = _mm_srli_si128(src_p2_8x16b, 8);
812 
813             _mm_storel_epi64((__m128i *)(pu1_src - 3 * src_strd), src_p2_8x16b);
814             _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
815             _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
816             _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
817             _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
818             _mm_storel_epi64((__m128i *)(pu1_src + 2 * src_strd), src_q2_8x16b);
819 
820 
821         }
822 
823         else
824         {
825 
826             __m128i tmp_delta0_8x16b, tmp_delta1_8x16b;
827             __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b;
828             __m128i coefdelta_0_8x16b;
829             __m128i const2_8x16b, consttc_8x16b;
830 
831             LWORD64 maskp0, maskp1, maskq0, maskq1;
832             maskp0 = (LWORD64)filter_flag_p;
833             maskq0 = (LWORD64)filter_flag_q;
834             maskp1 = (LWORD64)dep;
835             maskq1 = (LWORD64)deq;
836             consttc_8x16b = _mm_set1_epi32(tc);
837 
838             tmp_delta0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
839             tmp_delta1_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
840             //arranged q31 q30 p30 p31  q21 q20 p20 p21  q1 q10 p10 p11 q01 q00 p00 p01
841             tmp_delta1_8x16b = _mm_unpacklo_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
842 
843             coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
844             // (-3q1+9q0),(-9p0+3p1)
845             tmp_delta0_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
846 
847             //getting -tc store
848             tmp2_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
849 
850             //getting tc in 16 bit
851             consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
852             //calc 10 *tc = 2*tc +8*tc ; 2*tc
853             tmp_pq_str0_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
854             //calc 10 *tc = 2*tc +8*tc ; 8*tc
855             tmp_pq_str1_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
856 
857             //const 1
858             const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
859             //calc 10 *tc
860             tmp_pq_str0_8x16b = _mm_add_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
861             //delta0 without normalisation and clipping
862             tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta0_8x16b, const2_8x16b);
863 
864             const2_8x16b = _mm_srli_epi32(tmp2_const_8x16b, 31);
865 
866             //loaded coef for delta1 calculation
867             coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
868             //(-2q1+q0),(p0-2p1)
869             tmp_delta1_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
870             //const 8
871             const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
872 
873             //normalisation of the filter
874             tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
875             tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);
876 
877             //getting deltaq0
878             tmp_pq_str1_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp2_const_8x16b);
879             //getting -tc
880             tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
881             //packing  d03q d02q d01q d0q d03p d02p d01p d00p
882             tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_pq_str1_8x16b);
883             //absolute delta
884             tmp_pq_str1_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
885 
886             //Clipping of delta0
887             tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
888             //tc>>1 16 bit
889             consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
890             //Clipping of delta0
891             tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp1_const_8x16b);
892 
893             //(-tc)>>1 16 bit
894             tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
895             //mask for |delta| < 10*tc
896             tmp_pq_str0_8x16b = _mm_cmpgt_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
897             //delta 1 calc starts
898 
899             //getting q32 q22 q12 q02 p32 p12 p22 p02
900             tmp0_const_8x16b = _mm_setzero_si128();
901             src_q2_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp0_const_8x16b);
902             src_p2_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp0_const_8x16b);
903             src_p2_8x16b = _mm_unpacklo_epi64(src_p2_8x16b, src_q2_8x16b);
904             //constant 1
905             const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
906             //2*delta0
907             tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
908             //getting  all respective q's and p's together
909             coefdelta_0_8x16b = _mm_load_si128((__m128i *)(shuffle1));
910             tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, coefdelta_0_8x16b);
911             //final adds for deltap1 and deltaq1
912             tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, const2_8x16b);
913             src_p2_8x16b = _mm_add_epi16(src_p2_8x16b, tmp2_const_8x16b);
914             tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_p2_8x16b);
915             tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);
916 
917             //mask0= (((LWORD64)filter_flag_q)<<63)| (((LWORD64)filter_flag_p)<<31);
918             tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq0)));
919             src_p2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp0)));
920 
921             //   src_p2_8x16b = _mm_set_epi32(filter_flag_q,filter_flag_p,filter_flag_q,filter_flag_p);
922             //mask1= (((LWORD64)(filter_flag_q&deq))<<63)|(((LWORD64)(filter_flag_p & dep))<<31);
923             src_q2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq1)));
924             coefdelta_0_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp1)));
925 
926             src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, tmp_pq_str1_8x16b);
927             src_q2_8x16b = _mm_unpacklo_epi32(coefdelta_0_8x16b, src_q2_8x16b);
928             //src_q2_8x16b = _mm_set_epi32(deq,dep,deq,dep);
929             src_q2_8x16b = _mm_and_si128(src_q2_8x16b, src_p2_8x16b);
930 
931             //rearranging the mask values
932             src_q2_8x16b = _mm_shuffle_epi32(src_q2_8x16b, 0x50);
933             src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0x50);
934 
935             src_q2_8x16b = _mm_slli_epi32(src_q2_8x16b, 31);
936             src_p2_8x16b = _mm_slli_epi32(src_p2_8x16b, 31);
937             src_q2_8x16b = _mm_srai_epi32(src_q2_8x16b, 31);
938             src_p2_8x16b = _mm_srai_epi32(src_p2_8x16b, 31);
939 
940             //combining mask delta1
941             tmp_pq_str1_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_q2_8x16b);
942             // clipping delta1
943             tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
944             //combining mask delat0
945             tmp_pq_str0_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_p2_8x16b);
946             // clipping delta1
947             tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);
948 
949 
950             //masking of the delta values |delta|<10*tc
951             tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp_pq_str1_8x16b);
952             tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp_pq_str0_8x16b);
953             //separating p and q delta 0 and addinq p0 and q0
954             tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
955             tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
956             src_p0_8x16b = _mm_unpacklo_epi8(src_p0_8x16b, tmp0_const_8x16b);
957             src_q0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, tmp0_const_8x16b);
958             src_p0_8x16b = _mm_add_epi16(src_p0_8x16b, tmp_pq_str0_8x16b);
959             src_q0_8x16b = _mm_add_epi16(src_q0_8x16b, tmp_pq_str1_8x16b);
960             //separating p and q delta 0 and addinq p0 and q0
961             tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
962             tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
963             src_p1_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, tmp0_const_8x16b);
964             src_q1_8x16b = _mm_unpacklo_epi8(src_q1_8x16b, tmp0_const_8x16b);
965             src_p1_8x16b = _mm_add_epi16(src_p1_8x16b, tmp_pq_str0_8x16b);
966             src_q1_8x16b = _mm_add_epi16(src_q1_8x16b, tmp_pq_str1_8x16b);
967             //packing p1 q1 and p0 q0 to 8 bit
968             src_p1_8x16b = _mm_packus_epi16(src_p1_8x16b, src_q1_8x16b);
969             src_p0_8x16b = _mm_packus_epi16(src_p0_8x16b, src_q0_8x16b);
970 
971             src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
972             src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
973 
974             _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
975             _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
976             _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
977             _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
978 
979 
980         }
981 
982 
983 
984     }
985 
986 }
987 
/**
*******************************************************************************
*
* @brief
*       Filtering for the chroma block vertical edge.
*
* @par Description:
*     Derives a clipping threshold tc for U and for V from the two block QPs,
*     the per-plane QP offsets and tc_offset_div2. If both thresholds are zero
*     the function returns without reading or writing pixels. Otherwise it
*     reads 8 bytes from each of 4 consecutive rows starting at pu1_src - 4,
*     computes a delta per row and plane, clips it to [-tc, tc], adds it to
*     bytes 2-3 (p0) and subtracts it from bytes 4-5 (q0) of every row, and
*     writes the 8 bytes of each row back. Bytes 0-1 and 6-7 of every row are
*     written back unchanged.
*
* @param[in] pu1_src
*       Pointer to the first q-side byte of the first row; 4 bytes before and
*       4 bytes from this address are accessed on 4 rows.
*       NOTE(review): the layout comments below imply interleaved U/V samples
*       - confirm against the caller.
*
* @param[in] src_strd
*       Distance in bytes between two consecutive rows
*
* @param[in] quant_param_p
*       Quantization parameter of the p block
*
* @param[in] quant_param_q
*       Quantization parameter of the q block
*
* @param[in] qp_offset_u
*       Offset added to the averaged QP for the U plane
*
* @param[in] qp_offset_v
*       Offset added to the averaged QP for the V plane
*
* @param[in] tc_offset_div2
*       Half of the tc offset; doubled before being added to the tc index
*
* @param[in] filter_flag_p
*       Non-zero when the p side may be modified
*
* @param[in] filter_flag_q
*       Non-zero when the q side may be modified
*
* @returns
*       None
*
* @remarks
*       At least one of filter_flag_p / filter_flag_q must be non-zero
*       (checked with ASSERT).
*
*******************************************************************************
*/
void ihevc_deblk_chroma_vert_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;

    __m128i src_row_0_16x8b, tmp_pxl_0_16x8b, src_row_2_16x8b, tmp_pxl_1_16x8b;
    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2             */
    /* this function is assumed to be called only if BS is 2 */
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    /* tc_offset_div2 is multiplied instead of shifted: left-shifting a */
    /* negative signed value is undefined behaviour in C                */
    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 * 2), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 * 2), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    /* a zero threshold clips every delta to zero, so nothing would change */
    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    /* 8 bytes per row: 4 bytes on the p side and 4 bytes on the q side */
    src_row_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
    tmp_pxl_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd - 4));
    src_row_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd - 4));
    tmp_pxl_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd - 4));

    {
        LWORD64 mask_tc, mask;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        /* tc_u in the even and tc_v in the odd 16 bit lane once broadcast */
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
        /* keeps bytes 0-1 and 6-7 of every row (the pixels left untouched) */
        mask = 0xffff00000000ffffLL;

        /* row 0 | row 1 and row 2 | row 3 share one register each */
        src_row_0_16x8b = _mm_unpacklo_epi64(src_row_0_16x8b, tmp_pxl_0_16x8b);
        src_row_2_16x8b = _mm_unpacklo_epi64(src_row_2_16x8b, tmp_pxl_1_16x8b);

        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_uv));
        // qv11 qu11 qv10 qu10 qv01 qu01 qv00 qu00 pv10 pu10 pv11 pu11 pv00 pu00 pv01 pu01
        // qv31 qu31 qv30 qu30 qv21 qu21 qv20 qu20 pv30 pu30 pv31 pu31 pv20 pu20 pv21 pu21
        delta_vu0_16x8b = _mm_shuffle_epi8(src_row_0_16x8b, mask_16x8b);
        delta_vu1_16x8b = _mm_shuffle_epi8(src_row_2_16x8b, mask_16x8b);

        /* low halves hold the p side, high halves the q side of the rows */
        tmp_pxl_0_16x8b = _mm_unpacklo_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        tmp_pxl_1_16x8b = _mm_unpackhi_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        /* weighted byte pairs summed into 16 bit lanes */
        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);

        //all ones (-1 in every 16 bit lane), base for the offset 4
        const_16x8b = _mm_cmpeq_epi16(tmp_pxl_0_16x8b, tmp_pxl_0_16x8b);
        // tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));

        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp and filterq flag masks: all ones when the flag is set.
        //Built directly instead of shifting a flag into bit 63 of a signed
        //64 bit integer, which is undefined behaviour.
        mask_flag_p_16x8b = _mm_set1_epi32(filter_flag_p ? -1 : 0);
        mask_flag_q_16x8b = _mm_set1_epi32(filter_flag_q ? -1 : 0);

        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
        //converting const 4
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);

        mask_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
        //offset addition
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        //eliminating q1
        tmp_pxl_1_16x8b = _mm_slli_epi16(tmp_pxl_1_16x8b, 8);

        const_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
        //same keep-mask for both rows held in a register
        mask_16x8b = _mm_shuffle_epi32(mask_16x8b, 0x44);

        //clipping MAX
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //getting p0 and eliminating p1
        tmp_pxl_0_16x8b = _mm_srli_epi16(tmp_pxl_0_16x8b, 8);
        //clipping MIN
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
        //getting q0
        tmp_pxl_1_16x8b = _mm_srli_epi16(tmp_pxl_1_16x8b, 8);
        //masking filter flag
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q-delta ,p+delta
        tmp_pxl_1_16x8b = _mm_sub_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
        tmp_pxl_0_16x8b = _mm_add_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        //merging q0 and p0 of respective rows
        delta_vu1_16x8b = _mm_unpackhi_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        delta_vu0_16x8b = _mm_unpacklo_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        // row 0 and row 1 packed , row2 and row3 packed (saturated to 8 bit)
        delta_vu0_16x8b = _mm_packus_epi16(delta_vu0_16x8b, const_16x8b);
        delta_vu1_16x8b = _mm_packus_epi16(delta_vu1_16x8b, const_16x8b);
        //removing older pixel values
        src_row_0_16x8b = _mm_and_si128(src_row_0_16x8b, mask_16x8b);
        src_row_2_16x8b = _mm_and_si128(src_row_2_16x8b, mask_16x8b);
        //arranging modified pixels: one row per 64 bit half, then moved to
        //bytes 2-5 of that half
        delta_vu0_16x8b = _mm_shuffle_epi32(delta_vu0_16x8b, 0xd8);
        delta_vu1_16x8b = _mm_shuffle_epi32(delta_vu1_16x8b, 0xd8);
        delta_vu0_16x8b = _mm_slli_epi64(delta_vu0_16x8b, 16);
        delta_vu1_16x8b = _mm_slli_epi64(delta_vu1_16x8b, 16);
        //plugging the modified values
        src_row_0_16x8b = _mm_or_si128(src_row_0_16x8b, delta_vu0_16x8b);
        src_row_2_16x8b = _mm_or_si128(src_row_2_16x8b, delta_vu1_16x8b);


        //getting values for row 1 and row 3
        tmp_pxl_0_16x8b = _mm_srli_si128(src_row_0_16x8b, 8);
        tmp_pxl_1_16x8b = _mm_srli_si128(src_row_2_16x8b, 8);

        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), tmp_pxl_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row_2_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), tmp_pxl_1_16x8b);
    }



}
1138 
/**
*******************************************************************************
*
* @brief
*       Filtering for the chroma block horizontal edge.
*
* @par Description:
*     Derives a clipping threshold tc for U and for V from the two block QPs,
*     the per-plane QP offsets and tc_offset_div2. If both thresholds are zero
*     the function returns without reading or writing pixels. Otherwise it
*     reads 8 bytes from each of the rows pu1_src - 2 * src_strd,
*     pu1_src - src_strd, pu1_src and pu1_src + src_strd, computes a delta per
*     byte column, clips it to [-tc, tc], adds it to the row above the edge
*     (p0) and subtracts it from the row at pu1_src (q0). Only these two rows
*     are written back, 8 bytes each.
*
* @param[in] pu1_src
*       Pointer to the first byte of the first row below the edge.
*       NOTE(review): the layout comments below imply interleaved U/V samples
*       - confirm against the caller.
*
* @param[in] src_strd
*       Distance in bytes between two consecutive rows
*
* @param[in] quant_param_p
*       Quantization parameter of the p block
*
* @param[in] quant_param_q
*       Quantization parameter of the q block
*
* @param[in] qp_offset_u
*       Offset added to the averaged QP for the U plane
*
* @param[in] qp_offset_v
*       Offset added to the averaged QP for the V plane
*
* @param[in] tc_offset_div2
*       Half of the tc offset; doubled before being added to the tc index
*
* @param[in] filter_flag_p
*       Non-zero when the p row may be modified
*
* @param[in] filter_flag_q
*       Non-zero when the q row may be modified
*
* @returns
*       None
*
* @remarks
*       At least one of filter_flag_p / filter_flag_q must be non-zero
*       (checked with ASSERT).
*
*******************************************************************************
*/
void ihevc_deblk_chroma_horz_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;


    __m128i tmp_p0_16x8b, src_p0_16x8b, src_q0_16x8b, tmp_q0_16x8b;

    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2             */
    /* this function is assumed to be called only if BS is 2 */
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    /* tc_offset_div2 is multiplied instead of shifted: left-shifting a */
    /* negative signed value is undefined behaviour in C                */
    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 * 2), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 * 2), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    /* a zero threshold clips every delta to zero, so nothing would change */
    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    /* two rows above the edge (p1, p0) and two rows below it (q0, q1) */
    tmp_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
    src_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
    src_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
    tmp_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

    {
        LWORD64 mask_tc;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        /* tc_u in the even and tc_v in the odd 16 bit lane once broadcast */
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);

        /* byte-interleave p1 with p0 and q0 with q1 */
        tmp_p0_16x8b = _mm_unpacklo_epi8(tmp_p0_16x8b, src_p0_16x8b);
        tmp_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, tmp_q0_16x8b);

        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        /* weighted byte pairs summed into 16 bit lanes */
        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_p0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_q0_16x8b, delta_vu1_16x8b);


        // tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));

        //all ones (-1 in every 16 bit lane), base for the offset 4
        const_16x8b = _mm_cmpeq_epi16(tmp_p0_16x8b, tmp_p0_16x8b);
        // tc broadcast to every 32 bit lane
        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp flag mask: all ones when the flag is set. Built directly
        //instead of shifting a flag into bit 63 of a signed 64 bit integer,
        //which is undefined behaviour.
        mask_flag_p_16x8b = _mm_set1_epi32(filter_flag_p ? -1 : 0);


        //converting const 4
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);
        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);

        //filterq flag mask
        mask_flag_q_16x8b = _mm_set1_epi32(filter_flag_q ? -1 : 0);
        //offset addition
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        mask_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);

        //converting p0 to 16bit
        src_p0_16x8b = _mm_unpacklo_epi8(src_p0_16x8b, mask_16x8b);
        //clipping MAX
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //converting q0 to 16bit
        src_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, mask_16x8b);
        //clipping MIN
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);

        //masking filter flag
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q-delta ,p+delta
        src_q0_16x8b = _mm_sub_epi16(src_q0_16x8b, delta_vu1_16x8b);
        src_p0_16x8b = _mm_add_epi16(src_p0_16x8b, delta_vu0_16x8b);

        // p0 and q0 packed back to 8 bit with unsigned saturation
        src_q0_16x8b = _mm_packus_epi16(src_q0_16x8b, mask_16x8b);
        src_p0_16x8b = _mm_packus_epi16(src_p0_16x8b, mask_16x8b);



        _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_16x8b);
        _mm_storel_epi64((__m128i *)(pu1_src), src_q0_16x8b);

    }


}
1264