1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /*****************************************************************************/
21 /*                                                                           */
22 /*  File Name         : ih264_deblk_luma_ssse3.c                             */
23 /*                                                                           */
24 /*  Description       : Contains function definitions for deblocking         */
25 /*                                                                           */
26 /*  List of Functions : ih264_deblk_luma_vert_bs4_ssse3()                    */
27 /*                      ih264_deblk_luma_horz_bs4_ssse3()                    */
28 /*                      ih264_deblk_luma_vert_bslt4_ssse3()                  */
29 /*                      ih264_deblk_luma_horz_bslt4_ssse3()                  */
30 /*                      ih264_deblk_luma_vert_bs4_mbaff_ssse3()              */
31 /*                      ih264_deblk_luma_vert_bslt4_mbaff_ssse3()            */
32 /*                                                                           */
33 /*  Issues / Problems : None                                                 */
34 /*                                                                           */
35 /*  Revision History  :                                                      */
36 /*                                                                           */
37 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
38 /*         12 02 2015   Naveen Kumar P  Added luma deblocking ssse3          */
39 /*                                      intrinsics                           */
40 /*                                                                           */
41 /*****************************************************************************/
42 
43 /*****************************************************************************/
44 /* File Includes                                                             */
45 /*****************************************************************************/
46 
47 /* System include files */
48 #include <stdio.h>
49 
50 /* User include files */
51 #include "ih264_typedefs.h"
52 #include "ih264_platform_macros.h"
53 #include "ih264_deblk_edge_filters.h"
54 #include "ih264_macros.h"
55 
56 /*****************************************************************************/
57 /* Function Definitions                                                      */
58 /*****************************************************************************/
59 
60 /*****************************************************************************/
61 /*                                                                           */
62 /*  Function Name : ih264_deblk_luma_vert_bs4_ssse3()                        */
63 /*                                                                           */
64 /*  Description   : This function performs filtering of a luma block         */
65 /*                  vertical edge when the boundary strength is set to 4.    */
66 /*                                                                           */
67 /*  Inputs        : pu1_src    - pointer to the src sample q0                */
68 /*                  src_strd   - source stride                               */
69 /*                  alpha      - alpha value for the boundary                */
70 /*                  beta       - beta value for the boundary                 */
71 /*                                                                           */
72 /*  Globals       : None                                                     */
73 /*                                                                           */
74 /*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
75 /*                  title "Filtering process for edges for bS equal to 4" in */
76 /*                  ITU T Rec H.264.                                         */
77 /*                                                                           */
78 /*  Outputs       : None                                                     */
79 /*                                                                           */
80 /*  Returns       : None                                                     */
81 /*                                                                           */
82 /*  Issues        : None                                                     */
83 /*                                                                           */
84 /*  Revision History:                                                        */
85 /*                                                                           */
86 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
87 /*         12 02 2015   Naveen Kumar P  Initial version                      */
88 /*                                                                           */
89 /*****************************************************************************/
ih264_deblk_luma_vert_bs4_ssse3(UWORD8 * pu1_src,WORD32 src_strd,WORD32 alpha,WORD32 beta)90 void ih264_deblk_luma_vert_bs4_ssse3(UWORD8 *pu1_src,
91                                      WORD32 src_strd,
92                                      WORD32 alpha,
93                                      WORD32 beta)
94 {
95     __m128i zero = _mm_setzero_si128();
96     __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
97     __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
98     __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
99     __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
100     __m128i q0_16x8_1;
101     __m128i p0_16x8_1;
102     __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
103     __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
104     __m128i temp1, temp2, temp3, temp4, temp5, temp6;
105     __m128i Alpha_8x16, Beta_8x16;
106     __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
107     __m128i const_val2_16x8 = _mm_set1_epi16(2);
108     __m128i line1, line2, line3, line4, line5, line6, line7, line8;
109 
110     Alpha_8x16 = _mm_set1_epi16(alpha);
111     Beta_8x16 = _mm_set1_epi16(beta);
112 
113     line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
114     line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
115     line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
116     line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
117     line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
118     line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
119     line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
120     line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
121 
122     temp1 = _mm_unpacklo_epi8(line1, line2);
123     temp2 = _mm_unpacklo_epi8(line3, line4);
124     temp3 = _mm_unpacklo_epi8(line5, line6);
125     temp4 = _mm_unpacklo_epi8(line7, line8);
126 
127     line1 = _mm_unpacklo_epi16(temp1, temp2);
128     line2 = _mm_unpackhi_epi16(temp1, temp2);
129     line3 = _mm_unpacklo_epi16(temp3, temp4);
130     line4 = _mm_unpackhi_epi16(temp3, temp4);
131 
132     p1_8x16 = _mm_unpacklo_epi32(line1, line3);
133     p0_8x16 = _mm_unpackhi_epi32(line1, line3);
134     q0_8x16 = _mm_unpacklo_epi32(line2, line4);
135     q1_8x16 = _mm_unpackhi_epi32(line2, line4);
136 
137     line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd));
138     line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd));
139     line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd));
140     line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd));
141     line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd));
142     line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd));
143     line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd));
144     line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd));
145 
146     temp1 = _mm_unpacklo_epi8(line1, line2);
147     temp2 = _mm_unpacklo_epi8(line3, line4);
148     temp3 = _mm_unpacklo_epi8(line5, line6);
149     temp4 = _mm_unpacklo_epi8(line7, line8);
150 
151     line1 = _mm_unpacklo_epi16(temp1, temp2);
152     line2 = _mm_unpackhi_epi16(temp1, temp2);
153     line3 = _mm_unpacklo_epi16(temp3, temp4);
154     line4 = _mm_unpackhi_epi16(temp3, temp4);
155 
156     temp1 = _mm_unpacklo_epi32(line1, line3);
157     temp2 = _mm_unpackhi_epi32(line1, line3);
158     temp3 = _mm_unpacklo_epi32(line2, line4);
159     temp4 = _mm_unpackhi_epi32(line2, line4);
160 
161     p3_16x8 = _mm_unpacklo_epi64(p1_8x16, temp1);
162     p2_16x8 = _mm_unpackhi_epi64(p1_8x16, temp1);
163     q2_16x8 = _mm_unpacklo_epi64(q1_8x16, temp4);
164     q3_16x8 = _mm_unpackhi_epi64(q1_8x16, temp4);
165     p1_16x8 = _mm_unpacklo_epi64(p0_8x16, temp2);
166     p0_16x8 = _mm_unpackhi_epi64(p0_8x16, temp2);
167     q0_16x8 = _mm_unpacklo_epi64(q0_8x16, temp3);
168     q1_16x8 = _mm_unpackhi_epi64(q0_8x16, temp3);
169 
170     //Cond1 (ABS(p0 - q0) < alpha)
171     temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
172     temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
173     temp1 = _mm_add_epi8(temp1, temp2);
174 
175     temp2 = _mm_unpacklo_epi8(temp1, zero);
176     temp1 = _mm_unpackhi_epi8(temp1, zero);
177 
178     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
179     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
180 
181     flag1_16x8 = _mm_packs_epi16(temp2, temp1);
182 
183     //Cond2 (ABS(q1 - q0) < beta)
184     temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
185     temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
186     temp1 = _mm_add_epi8(temp1, temp2);
187 
188     temp2 = _mm_unpacklo_epi8(temp1, zero);
189     temp1 = _mm_unpackhi_epi8(temp1, zero);
190 
191     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
192     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
193 
194     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
195 
196     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
197 
198     //Cond3 (ABS(p1 - p0) < beta)
199     temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
200     temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
201     temp1 = _mm_add_epi8(temp1, temp2);
202 
203     temp2 = _mm_unpacklo_epi8(temp1, zero);
204     temp1 = _mm_unpackhi_epi8(temp1, zero);
205 
206     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
207     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
208 
209     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
210 
211     // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
212     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
213 
214     // (ABS(p0 - q0) < ((alpha >> 2) + 2))
215     temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
216     temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
217     temp1 = _mm_add_epi8(temp1, temp2);
218     Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
219     Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
220 
221     temp2 = _mm_unpacklo_epi8(temp1, zero);
222     temp1 = _mm_unpackhi_epi8(temp1, zero);
223     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
224     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
225 
226     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
227     flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
228 
229     // (ABS(p2 - p0) < beta)
230     temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
231     temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
232     temp1 = _mm_add_epi8(temp1, temp2);
233 
234     temp2 = _mm_unpacklo_epi8(temp1, zero);
235     temp1 = _mm_unpackhi_epi8(temp1, zero);
236     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
237     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
238 
239     flag3_16x8 = _mm_packs_epi16(temp2, temp1);
240     flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
241 
242     // (ABS(q2 - q0) < beta)
243     temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
244     temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
245     temp1 = _mm_add_epi8(temp1, temp2);
246 
247     temp2 = _mm_unpacklo_epi8(temp1, zero);
248     temp1 = _mm_unpackhi_epi8(temp1, zero);
249     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
250     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
251 
252     flag4_16x8 = _mm_packs_epi16(temp2, temp1);
253     flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
254 
255     // First 8 pixels
256     p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
257     p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
258     p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
259     p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
260     q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
261     q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
262     q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
263     q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
264 
265     // p0_1 and q0_1
266     temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
267     temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
268     temp5 = _mm_add_epi16(temp1, const_val2_16x8);
269     temp6 = _mm_add_epi16(temp2, const_val2_16x8);
270     temp3 = _mm_slli_epi16(p1_8x16, 1);
271     temp4 = _mm_slli_epi16(q1_8x16, 1);
272     temp1 = _mm_add_epi16(temp5, temp3);
273     temp2 = _mm_add_epi16(temp6, temp4);
274     p0_16x8_1 = _mm_srai_epi16(temp1, 2);
275     q0_16x8_1 = _mm_srai_epi16(temp2, 2);
276 
277     // p1_2 and q1_2
278     temp6 = _mm_add_epi16(temp6, p0_8x16);
279     temp5 = _mm_add_epi16(temp5, q0_8x16);
280     temp1 = _mm_add_epi16(temp6, p2_8x16);
281     temp2 = _mm_add_epi16(temp5, q2_8x16);
282     p1_16x8_2 = _mm_srai_epi16(temp1, 2);
283     q1_16x8_2 = _mm_srai_epi16(temp2, 2);
284 
285     // p0_2 and q0_2
286     temp1 = _mm_add_epi16(temp3, p2_8x16);
287     temp2 = _mm_add_epi16(temp4, q2_8x16);
288     temp1 = _mm_add_epi16(temp1, q1_8x16);
289     temp2 = _mm_add_epi16(temp2, p1_8x16);
290     temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
291     temp3 = _mm_slli_epi16(temp3, 1);
292     temp1 = _mm_add_epi16(temp1, temp3);
293     temp2 = _mm_add_epi16(temp2, temp3);
294     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
295     temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
296     p0_16x8_2 = _mm_srai_epi16(temp1, 3);
297     q0_16x8_2 = _mm_srai_epi16(temp2, 3);
298 
299     // p2_2 and q2_2
300     temp1 = _mm_add_epi16(temp6, const_val2_16x8);
301     temp2 = _mm_add_epi16(temp5, const_val2_16x8);
302     temp3 = _mm_slli_epi16(p2_8x16, 1);
303     temp4 = _mm_slli_epi16(q2_8x16, 1);
304     temp3 = _mm_add_epi16(p2_8x16, temp3);
305     temp4 = _mm_add_epi16(q2_8x16, temp4);
306     temp5 = _mm_slli_epi16(p3_8x16, 1);
307     temp6 = _mm_slli_epi16(q3_8x16, 1);
308     temp1 = _mm_add_epi16(temp1, temp3);
309     temp2 = _mm_add_epi16(temp2, temp4);
310     temp1 = _mm_add_epi16(temp1, temp5);
311     temp2 = _mm_add_epi16(temp2, temp6);
312     p2_16x8_2 = _mm_srai_epi16(temp1, 3);
313     q2_16x8_2 = _mm_srai_epi16(temp2, 3);
314 
315     // Second 8 pixels and packing with first 8 pixels
316     p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
317     p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
318     p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
319     p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
320     q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
321     q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
322     q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
323     q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
324 
325     // p0_1 and q0_1
326     temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
327     temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
328     temp5 = _mm_add_epi16(temp1, const_val2_16x8);
329     temp6 = _mm_add_epi16(temp2, const_val2_16x8);
330     temp3 = _mm_slli_epi16(p1_8x16, 1);
331     temp4 = _mm_slli_epi16(q1_8x16, 1);
332     temp1 = _mm_add_epi16(temp5, temp3);
333     temp2 = _mm_add_epi16(temp6, temp4);
334     temp1 = _mm_srai_epi16(temp1, 2);
335     temp2 = _mm_srai_epi16(temp2, 2);
336     p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
337     q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
338 
339     // p1_2 and q1_2
340     temp6 = _mm_add_epi16(temp6, p0_8x16);
341     temp5 = _mm_add_epi16(temp5, q0_8x16);
342     temp1 = _mm_add_epi16(temp6, p2_8x16);
343     temp2 = _mm_add_epi16(temp5, q2_8x16);
344     temp1 = _mm_srai_epi16(temp1, 2);
345     temp2 = _mm_srai_epi16(temp2, 2);
346     p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
347     q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
348 
349     // p0_2 and q0_2
350     temp1 = _mm_add_epi16(temp3, p2_8x16);
351     temp2 = _mm_add_epi16(temp4, q2_8x16);
352     temp1 = _mm_add_epi16(temp1, q1_8x16);
353     temp2 = _mm_add_epi16(temp2, p1_8x16);
354     temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
355     temp3 = _mm_slli_epi16(temp3, 1);
356     temp1 = _mm_add_epi16(temp1, temp3);
357     temp2 = _mm_add_epi16(temp2, temp3);
358     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
359     temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
360     temp1 = _mm_srai_epi16(temp1, 3);
361     temp2 = _mm_srai_epi16(temp2, 3);
362     p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
363     q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
364 
365     // p2_2 and q2_2
366     temp1 = _mm_add_epi16(temp6, const_val2_16x8);
367     temp2 = _mm_add_epi16(temp5, const_val2_16x8);
368     temp3 = _mm_slli_epi16(p2_8x16, 1);
369     temp4 = _mm_slli_epi16(q2_8x16, 1);
370     temp3 = _mm_add_epi16(p2_8x16, temp3);
371     temp4 = _mm_add_epi16(q2_8x16, temp4);
372     temp5 = _mm_slli_epi16(p3_8x16, 1);
373     temp6 = _mm_slli_epi16(q3_8x16, 1);
374     temp1 = _mm_add_epi16(temp1, temp3);
375     temp2 = _mm_add_epi16(temp2, temp4);
376     temp1 = _mm_add_epi16(temp1, temp5);
377     temp2 = _mm_add_epi16(temp2, temp6);
378     temp1 = _mm_srai_epi16(temp1, 3);
379     temp2 = _mm_srai_epi16(temp2, 3);
380     p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
381     q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
382 
383     // p0 and q0
384     p0_16x8 = _mm_and_si128(p0_16x8,
385                             _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
386     p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
387     p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
388     q0_16x8 = _mm_and_si128(q0_16x8,
389                             _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
390     q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
391     q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
392 
393     // p0 and q0
394     p0_16x8 = _mm_and_si128(p0_16x8,
395                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
396     p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
397     p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
398     q0_16x8 = _mm_and_si128(q0_16x8,
399                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
400     q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
401     q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
402 
403     // p1 and q1
404     p1_16x8 = _mm_and_si128(p1_16x8,
405                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
406     p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
407     p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
408     q1_16x8 = _mm_and_si128(q1_16x8,
409                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
410     q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
411     q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
412 
413     // p2 and q2
414     p2_16x8 = _mm_and_si128(p2_16x8,
415                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
416     p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
417     p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
418     q2_16x8 = _mm_and_si128(q2_16x8,
419                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
420     q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
421     q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
422 
423     temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
424     temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
425     temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
426     temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
427 
428     p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
429     p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
430     q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
431     q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
432 
433     line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
434     line2 = _mm_srli_si128(line1, 8);
435     line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
436     line4 = _mm_srli_si128(line3, 8);
437     line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
438     line6 = _mm_srli_si128(line5, 8);
439     line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
440     line8 = _mm_srli_si128(line7, 8);
441 
442     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
443     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
444     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
445     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
446     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
447     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
448     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
449     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
450 
451     temp1 = _mm_unpackhi_epi8(p3_16x8, p2_16x8);
452     temp2 = _mm_unpackhi_epi8(p1_16x8, p0_16x8);
453     temp3 = _mm_unpackhi_epi8(q0_16x8, q1_16x8);
454     temp4 = _mm_unpackhi_epi8(q2_16x8, q3_16x8);
455 
456     p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
457     p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
458     q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
459     q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
460 
461     line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
462     line2 = _mm_srli_si128(line1, 8);
463     line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
464     line4 = _mm_srli_si128(line3, 8);
465     line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
466     line6 = _mm_srli_si128(line5, 8);
467     line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
468     line8 = _mm_srli_si128(line7, 8);
469 
470     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd), line1);
471     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd), line2);
472     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd), line3);
473     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd), line4);
474     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd), line5);
475     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd), line6);
476     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd), line7);
477     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd), line8);
478 
479 }
480 
481 /*****************************************************************************/
482 /*                                                                           */
483 /*  Function Name : ih264_deblk_luma_horz_bs4_ssse3()                        */
484 /*                                                                           */
485 /*  Description   : This function performs filtering of a luma block         */
486 /*                  horizontal edge when the boundary strength is set to 4.  */
487 /*                                                                           */
488 /*  Inputs        : pu1_src    - pointer to the src sample q0                */
489 /*                  src_strd   - source stride                               */
490 /*                  alpha      - alpha value for the boundary                */
491 /*                  beta       - beta value for the boundary                 */
492 /*                                                                           */
493 /*  Globals       : None                                                     */
494 /*                                                                           */
495 /*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
496 /*                  title "Filtering process for edges for bS equal to 4" in */
497 /*                  ITU T Rec H.264.                                         */
498 /*                                                                           */
499 /*  Outputs       : None                                                     */
500 /*                                                                           */
501 /*  Returns       : None                                                     */
502 /*                                                                           */
503 /*  Issues        : None                                                     */
504 /*                                                                           */
505 /*  Revision History:                                                        */
506 /*                                                                           */
507 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
508 /*         12 02 2015   Naveen Kumar P  Initial version                      */
509 /*                                                                           */
510 /*****************************************************************************/
ih264_deblk_luma_horz_bs4_ssse3(UWORD8 * pu1_src,WORD32 src_strd,WORD32 alpha,WORD32 beta)511 void ih264_deblk_luma_horz_bs4_ssse3(UWORD8 *pu1_src,
512                                      WORD32 src_strd,
513                                      WORD32 alpha,
514                                      WORD32 beta)
515 {
516     WORD16 i16_posP3, i16_posP2, i16_posP1, i16_posP0;
517     WORD16 i16_posQ1, i16_posQ2, i16_posQ3;
518     UWORD8 *pu1_HorzPixel;
519     __m128i zero = _mm_setzero_si128();
520     __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
521     __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
522     __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
523     __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
524     __m128i q0_16x8_1;
525     __m128i p0_16x8_1;
526     __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
527     __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
528     __m128i temp1, temp2, temp3, temp4, temp5, temp6;
529     __m128i Alpha_8x16, Beta_8x16;
530     __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
531     __m128i const_val2_16x8 = _mm_set1_epi16(2);
532 
533     pu1_HorzPixel = pu1_src - (src_strd << 2);
534 
535     i16_posQ1 = src_strd;
536     i16_posQ2 = X2(src_strd);
537     i16_posQ3 = X3(src_strd);
538     i16_posP0 = X3(src_strd);
539     i16_posP1 = X2(src_strd);
540     i16_posP2 = src_strd;
541     i16_posP3 = 0;
542 
543     Alpha_8x16 = _mm_set1_epi16(alpha);
544     Beta_8x16 = _mm_set1_epi16(beta);
545 
546     p3_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP3));
547     p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
548     p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
549     p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
550     q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
551     q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
552     q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
553     q3_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ3));
554 
555     //Cond1 (ABS(p0 - q0) < alpha)
556     temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
557     temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
558     temp1 = _mm_add_epi8(temp1, temp2);
559 
560     temp2 = _mm_unpacklo_epi8(temp1, zero);
561     temp1 = _mm_unpackhi_epi8(temp1, zero);
562 
563     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
564     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
565 
566     flag1_16x8 = _mm_packs_epi16(temp2, temp1);
567 
568     //Cond2 (ABS(q1 - q0) < beta)
569     temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
570     temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
571     temp1 = _mm_add_epi8(temp1, temp2);
572 
573     temp2 = _mm_unpacklo_epi8(temp1, zero);
574     temp1 = _mm_unpackhi_epi8(temp1, zero);
575 
576     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
577     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
578 
579     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
580 
581     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
582 
583     //Cond3 (ABS(p1 - p0) < beta)
584     temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
585     temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
586     temp1 = _mm_add_epi8(temp1, temp2);
587 
588     temp2 = _mm_unpacklo_epi8(temp1, zero);
589     temp1 = _mm_unpackhi_epi8(temp1, zero);
590 
591     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
592     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
593 
594     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
595 
596     // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
597     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
598 
599     // (ABS(p0 - q0) < ((alpha >> 2) + 2))
600     temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
601     temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
602     temp1 = _mm_add_epi8(temp1, temp2);
603     Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
604     Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
605 
606     temp2 = _mm_unpacklo_epi8(temp1, zero);
607     temp1 = _mm_unpackhi_epi8(temp1, zero);
608     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
609     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
610 
611     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
612     flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
613 
614     // (ABS(p2 - p0) < beta)
615     temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
616     temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
617     temp1 = _mm_add_epi8(temp1, temp2);
618 
619     temp2 = _mm_unpacklo_epi8(temp1, zero);
620     temp1 = _mm_unpackhi_epi8(temp1, zero);
621     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
622     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
623 
624     flag3_16x8 = _mm_packs_epi16(temp2, temp1);
625     flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
626 
627     // (ABS(q2 - q0) < beta)
628     temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
629     temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
630     temp1 = _mm_add_epi8(temp1, temp2);
631 
632     temp2 = _mm_unpacklo_epi8(temp1, zero);
633     temp1 = _mm_unpackhi_epi8(temp1, zero);
634     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
635     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
636 
637     flag4_16x8 = _mm_packs_epi16(temp2, temp1);
638     flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
639 
640     // First 8 pixels
641     p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
642     p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
643     p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
644     p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
645     q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
646     q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
647     q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
648     q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
649 
650     // p0_1 and q0_1
651     temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
652     temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
653     temp5 = _mm_add_epi16(temp1, const_val2_16x8);
654     temp6 = _mm_add_epi16(temp2, const_val2_16x8);
655     temp3 = _mm_slli_epi16(p1_8x16, 1);
656     temp4 = _mm_slli_epi16(q1_8x16, 1);
657     temp1 = _mm_add_epi16(temp5, temp3);
658     temp2 = _mm_add_epi16(temp6, temp4);
659     p0_16x8_1 = _mm_srai_epi16(temp1, 2);
660     q0_16x8_1 = _mm_srai_epi16(temp2, 2);
661 
662     // p1_2 and q1_2
663     temp6 = _mm_add_epi16(temp6, p0_8x16);
664     temp5 = _mm_add_epi16(temp5, q0_8x16);
665     temp1 = _mm_add_epi16(temp6, p2_8x16);
666     temp2 = _mm_add_epi16(temp5, q2_8x16);
667     p1_16x8_2 = _mm_srai_epi16(temp1, 2);
668     q1_16x8_2 = _mm_srai_epi16(temp2, 2);
669 
670     // p0_2 and q0_2
671     temp1 = _mm_add_epi16(temp3, p2_8x16);
672     temp2 = _mm_add_epi16(temp4, q2_8x16);
673     temp1 = _mm_add_epi16(temp1, q1_8x16);
674     temp2 = _mm_add_epi16(temp2, p1_8x16);
675     temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
676     temp3 = _mm_slli_epi16(temp3, 1);
677     temp1 = _mm_add_epi16(temp1, temp3);
678     temp2 = _mm_add_epi16(temp2, temp3);
679     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
680     temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
681     p0_16x8_2 = _mm_srai_epi16(temp1, 3);
682     q0_16x8_2 = _mm_srai_epi16(temp2, 3);
683 
684     // p2_2 and q2_2
685     temp1 = _mm_add_epi16(temp6, const_val2_16x8);
686     temp2 = _mm_add_epi16(temp5, const_val2_16x8);
687     temp3 = _mm_slli_epi16(p2_8x16, 1);
688     temp4 = _mm_slli_epi16(q2_8x16, 1);
689     temp3 = _mm_add_epi16(p2_8x16, temp3);
690     temp4 = _mm_add_epi16(q2_8x16, temp4);
691     temp5 = _mm_slli_epi16(p3_8x16, 1);
692     temp6 = _mm_slli_epi16(q3_8x16, 1);
693     temp1 = _mm_add_epi16(temp1, temp3);
694     temp2 = _mm_add_epi16(temp2, temp4);
695     temp1 = _mm_add_epi16(temp1, temp5);
696     temp2 = _mm_add_epi16(temp2, temp6);
697     p2_16x8_2 = _mm_srai_epi16(temp1, 3);
698     q2_16x8_2 = _mm_srai_epi16(temp2, 3);
699 
700     // Second 8 pixels and packing with first 8 pixels
701     p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
702     p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
703     p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
704     p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
705     q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
706     q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
707     q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
708     q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
709 
710     // p0_1 and q0_1
711     temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
712     temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
713     temp5 = _mm_add_epi16(temp1, const_val2_16x8);
714     temp6 = _mm_add_epi16(temp2, const_val2_16x8);
715     temp3 = _mm_slli_epi16(p1_8x16, 1);
716     temp4 = _mm_slli_epi16(q1_8x16, 1);
717     temp1 = _mm_add_epi16(temp5, temp3);
718     temp2 = _mm_add_epi16(temp6, temp4);
719     temp1 = _mm_srai_epi16(temp1, 2);
720     temp2 = _mm_srai_epi16(temp2, 2);
721     p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
722     q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
723 
724     // p1_2 and q1_2
725     temp6 = _mm_add_epi16(temp6, p0_8x16);
726     temp5 = _mm_add_epi16(temp5, q0_8x16);
727     temp1 = _mm_add_epi16(temp6, p2_8x16);
728     temp2 = _mm_add_epi16(temp5, q2_8x16);
729     temp1 = _mm_srai_epi16(temp1, 2);
730     temp2 = _mm_srai_epi16(temp2, 2);
731     p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
732     q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
733 
734     // p0_2 and q0_2
735     temp1 = _mm_add_epi16(temp3, p2_8x16);
736     temp2 = _mm_add_epi16(temp4, q2_8x16);
737     temp1 = _mm_add_epi16(temp1, q1_8x16);
738     temp2 = _mm_add_epi16(temp2, p1_8x16);
739     temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
740     temp3 = _mm_slli_epi16(temp3, 1);
741     temp1 = _mm_add_epi16(temp1, temp3);
742     temp2 = _mm_add_epi16(temp2, temp3);
743     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
744     temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
745     temp1 = _mm_srai_epi16(temp1, 3);
746     temp2 = _mm_srai_epi16(temp2, 3);
747     p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
748     q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
749 
750     // p2_2 and q2_2
751     temp1 = _mm_add_epi16(temp6, const_val2_16x8);
752     temp2 = _mm_add_epi16(temp5, const_val2_16x8);
753     temp3 = _mm_slli_epi16(p2_8x16, 1);
754     temp4 = _mm_slli_epi16(q2_8x16, 1);
755     temp3 = _mm_add_epi16(p2_8x16, temp3);
756     temp4 = _mm_add_epi16(q2_8x16, temp4);
757     temp5 = _mm_slli_epi16(p3_8x16, 1);
758     temp6 = _mm_slli_epi16(q3_8x16, 1);
759     temp1 = _mm_add_epi16(temp1, temp3);
760     temp2 = _mm_add_epi16(temp2, temp4);
761     temp1 = _mm_add_epi16(temp1, temp5);
762     temp2 = _mm_add_epi16(temp2, temp6);
763     temp1 = _mm_srai_epi16(temp1, 3);
764     temp2 = _mm_srai_epi16(temp2, 3);
765     p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
766     q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
767 
768     // p0 and q0
769     p0_16x8 = _mm_and_si128(p0_16x8,
770                             _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
771     p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
772     p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
773     q0_16x8 = _mm_and_si128(q0_16x8,
774                             _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
775     q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
776     q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
777 
778     // p0 and q0
779     p0_16x8 = _mm_and_si128(p0_16x8,
780                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
781     p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
782     p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
783     q0_16x8 = _mm_and_si128(q0_16x8,
784                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
785     q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
786     q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
787 
788     // p1 and q1
789     p1_16x8 = _mm_and_si128(p1_16x8,
790                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
791     p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
792     p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
793     q1_16x8 = _mm_and_si128(q1_16x8,
794                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
795     q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
796     q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
797 
798     // p2 and q2
799     p2_16x8 = _mm_and_si128(p2_16x8,
800                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
801     p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
802     p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
803     q2_16x8 = _mm_and_si128(q2_16x8,
804                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
805     q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
806     q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
807 
808     _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP2), p2_16x8);
809     _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), p1_16x8);
810     _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), p0_16x8);
811 
812     _mm_storeu_si128((__m128i *)(pu1_src), q0_16x8);
813     _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), q1_16x8);
814     _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ2), q2_16x8);
815 
816 }
817 
818 /*****************************************************************************/
819 /*                                                                           */
820 /*  Function Name : ih264_deblk_luma_vert_bslt4_ssse3()                      */
821 /*                                                                           */
822 /*  Description   : This function performs filtering of a luma block         */
823 /*                  vertical edge when the boundary strength is less than 4. */
824 /*                                                                           */
825 /*  Inputs        : pu1_src       - pointer to the src sample q0             */
826 /*                  src_strd      - source stride                            */
827 /*                  alpha         - alpha value for the boundary             */
828 /*                  beta          - beta value for the boundary              */
829 /*                  u4_bs         - packed Boundary strength array           */
830 /*                  pu1_cliptab   - tc0_table                                */
831 /*                                                                           */
832 /*  Globals       : None                                                     */
833 /*                                                                           */
834 /*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
835 /*                  title "Filtering process for edges for bS less than 4"   */
836 /*                  in ITU T Rec H.264.                                      */
837 /*                                                                           */
838 /*  Outputs       : None                                                     */
839 /*                                                                           */
840 /*  Returns       : None                                                     */
841 /*                                                                           */
842 /*  Issues        : None                                                     */
843 /*                                                                           */
844 /*  Revision History:                                                        */
845 /*                                                                           */
846 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
847 /*         12 02 2015   Naveen Kumar P  Initial version                      */
848 /*                                                                           */
849 /*****************************************************************************/
void ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha,
                                       WORD32 beta,
                                       UWORD32 u4_bs,
                                       const UWORD8 *pu1_cliptab)
{
    /*
     * Filters 16 rows across a vertical edge, in place, as two passes of 8
     * rows each.  Every pass:
     *   1. loads 8 bytes per row starting at pu1_src - 3, i.e. the samples
     *      p2 p1 p0 | q0 q1 q2 q3 q4 of that row,
     *   2. widens them to 16 bit and transposes, so that each register holds
     *      one sample position (column) for all 8 rows,
     *   3. applies the bS < 4 filter to p1, p0, q0 and q1,
     *   4. transposes back, packs with unsigned saturation and stores the
     *      8 bytes of every row again (p2, q2, q3, q4 are written unchanged).
     *
     * u4_bs carries one boundary strength per group of 4 rows, most
     * significant byte first; pu1_cliptab is indexed by that strength to get
     * the clipping value C0.
     *
     * The pass count is fixed at two.  The former loop condition
     * (j <= 8 * src_strd) tied the trip count to the stride value: a zero
     * stride never terminated and a negative stride skipped the filter.
     */
    UWORD8 u1_Bs, u1_Bs1;
    UWORD8 *pu1_row;
    WORD32 i;

    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
    __m128i int1, int2, int3, int4, high1, high2;
    __m128i flag, flag1, i_C, i_C0, neg_C0;
    __m128i i_Ap, i_Aq, diff, in_macro, in_macrotemp, temp, temp1;

    /* Loop-invariant constants */
    const __m128i zero = _mm_setzero_si128();
    const __m128i all_ones = _mm_set1_epi16(-1);
    const __m128i const_val1 = _mm_set1_epi16(1);
    const __m128i const_val4 = _mm_set1_epi16(4);
    const __m128i alpha_8x16 = _mm_set1_epi16(alpha);
    const __m128i beta_8x16 = _mm_set1_epi16(beta);

    for(i = 0; i < 2; i++)
    {
        /* First of the 8 rows handled in this pass, at sample p2 */
        pu1_row = pu1_src - 3 + i * 8 * src_strd;

        /*********************************************************************/
        /* Transpose 8x8: rows -> columns                                    */
        /*********************************************************************/
        linea = _mm_loadl_epi64((__m128i *)(pu1_row));
        lineb = _mm_loadl_epi64((__m128i *)(pu1_row + src_strd));
        linec = _mm_loadl_epi64((__m128i *)(pu1_row + 2 * src_strd));
        lined = _mm_loadl_epi64((__m128i *)(pu1_row + 3 * src_strd));

        linea = _mm_unpacklo_epi8(linea, zero);
        lineb = _mm_unpacklo_epi8(lineb, zero);
        linec = _mm_unpacklo_epi8(linec, zero);
        lined = _mm_unpacklo_epi8(lined, zero);

        int1 = _mm_unpacklo_epi16(linea, lineb);
        lineb = _mm_unpackhi_epi16(linea, lineb);

        int2 = _mm_unpacklo_epi16(linec, lined);
        lined = _mm_unpackhi_epi16(linec, lined);

        linea = _mm_unpacklo_epi16(int1, int2);
        int1 = _mm_unpackhi_epi16(int1, int2);

        linec = _mm_unpacklo_epi16(lineb, lined);
        high1 = _mm_unpackhi_epi16(lineb, lined);

        linee = _mm_loadl_epi64((__m128i *)(pu1_row + 4 * src_strd));
        linef = _mm_loadl_epi64((__m128i *)(pu1_row + 5 * src_strd));
        lineg = _mm_loadl_epi64((__m128i *)(pu1_row + 6 * src_strd));
        lineh = _mm_loadl_epi64((__m128i *)(pu1_row + 7 * src_strd));

        linee = _mm_unpacklo_epi8(linee, zero);
        linef = _mm_unpacklo_epi8(linef, zero);
        lineg = _mm_unpacklo_epi8(lineg, zero);
        lineh = _mm_unpacklo_epi8(lineh, zero);

        int2 = _mm_unpacklo_epi16(linee, linef);
        linef = _mm_unpackhi_epi16(linee, linef);

        int3 = _mm_unpacklo_epi16(lineg, lineh);
        lineh = _mm_unpackhi_epi16(lineg, lineh);

        linee = _mm_unpacklo_epi16(int2, int3);
        int2 = _mm_unpackhi_epi16(int2, int3);

        lineg = _mm_unpacklo_epi16(linef, lineh);
        high2 = _mm_unpackhi_epi16(linef, lineh);

        int4 = _mm_unpacklo_epi16(linea, linee);
        lineb = _mm_unpackhi_epi16(linea, linee);

        int3 = _mm_unpacklo_epi16(int1, int2);
        lined = _mm_unpackhi_epi16(int1, int2);

        int2 = _mm_unpacklo_epi16(linec, lineg);
        linef = _mm_unpackhi_epi16(linec, lineg);

        linea = int4;
        linec = int3;
        linee = int2;

        lineg = _mm_unpacklo_epi16(high1, high2);
        lineh = _mm_unpackhi_epi16(high1, high2);

        /*
         * Column registers now hold:
         *   linea = p2, lineb = p1, linec = p0,
         *   lined = q0, linee = q1, linef = q2, lineg = q3, lineh = q4.
         * The 16-bit lanes 0..7 of each register correspond to rows
         * 0, 4, 2, 6, 1, 5, 3, 7: even lanes belong to the first four rows,
         * odd lanes to the last four rows of this pass.
         */

        /*********************************************************************/
        /* Per-row boundary strength mask and clipping value C0              */
        /*********************************************************************/
        u1_Bs = (u4_bs >> 24) & 0xff;  /* rows 0..3 of this pass */
        u1_Bs1 = (u4_bs >> 16) & 0xff; /* rows 4..7 of this pass */
        u4_bs <<= 16;                  /* expose the strengths of next pass */

        /* Alternating pattern matches the even/odd lane order noted above */
        flag1 = _mm_set_epi16(u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, u1_Bs1, u1_Bs,
                              u1_Bs1, u1_Bs);
        flag1 = _mm_cmpeq_epi16(flag1, zero);
        flag1 = _mm_xor_si128(flag1, all_ones); /* all ones where bS != 0 */

        i_C0 = _mm_set_epi16(pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs]);
        /* C0 is built from unsigned bytes, hence non-negative */
        neg_C0 = _mm_subs_epi16(zero, i_C0);

        /*********************************************************************/
        /* Filter enable: bS != 0 && |p0 - q0| < alpha &&                    */
        /*                |q1 - q0| < beta && |p1 - p0| < beta               */
        /*********************************************************************/
        diff = _mm_abs_epi16(_mm_subs_epi16(linec, lined));
        flag = _mm_cmpgt_epi16(alpha_8x16, diff);

        diff = _mm_abs_epi16(_mm_subs_epi16(linee, lined));
        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(beta_8x16, diff));

        diff = _mm_abs_epi16(_mm_subs_epi16(lineb, linec));
        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(beta_8x16, diff));

        flag = _mm_and_si128(flag, flag1);

        /*********************************************************************/
        /* C = C0 + (Ap < beta) + (Aq < beta)                                */
        /*********************************************************************/
        i_Ap = _mm_abs_epi16(_mm_subs_epi16(linea, linec)); /* |p2 - p0| */
        temp = _mm_cmpgt_epi16(beta_8x16, i_Ap);
        temp = _mm_subs_epi16(zero, temp); /* 0xFFFF -> 1, 0 -> 0 */
        i_C = _mm_add_epi16(i_C0, temp);

        i_Aq = _mm_abs_epi16(_mm_subs_epi16(linef, lined)); /* |q2 - q0| */
        temp = _mm_cmpgt_epi16(beta_8x16, i_Aq);
        temp = _mm_subs_epi16(zero, temp);
        i_C = _mm_add_epi16(i_C, temp);

        /*********************************************************************/
        /* delta = CLIP3(-C, C, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)     */
        /*********************************************************************/
        diff = _mm_subs_epi16(lined, linec);
        diff = _mm_slli_epi16(diff, 2);
        diff = _mm_add_epi16(diff, _mm_subs_epi16(lineb, linee));
        diff = _mm_add_epi16(diff, const_val4);
        in_macro = _mm_srai_epi16(diff, 3);

        in_macro = _mm_min_epi16(i_C, in_macro);
        in_macro = _mm_max_epi16(_mm_subs_epi16(zero, i_C), in_macro);

        /* p0' = p0 + delta where enabled, p0 otherwise.  Kept in temp so the */
        /* unfiltered p0 stays available for the p1/q1 updates below; the     */
        /* clamp to 0..255 happens in the saturating pack before the store.   */
        in_macrotemp = _mm_add_epi16(linec, in_macro);
        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
        temp = _mm_and_si128(linec, _mm_xor_si128(flag, all_ones));
        temp = _mm_add_epi16(temp, in_macrotemp);

        /* q0' = q0 - delta where enabled, q0 otherwise (kept in temp1) */
        in_macrotemp = _mm_subs_epi16(lined, in_macro);
        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
        temp1 = _mm_and_si128(lined, _mm_xor_si128(flag, all_ones));
        temp1 = _mm_add_epi16(temp1, in_macrotemp);

        /* (p0 + q0 + 1) >> 1, shared by the p1 and q1 updates */
        in_macrotemp = _mm_add_epi16(linec, lined);
        in_macrotemp = _mm_add_epi16(in_macrotemp, const_val1);
        in_macrotemp = _mm_srai_epi16(in_macrotemp, 1);

        /*********************************************************************/
        /* if(Ap < beta)                                                     */
        /*   p1 += CLIP3(-C0, C0, (p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) */
        /*********************************************************************/
        flag1 = _mm_cmpgt_epi16(beta_8x16, i_Ap);
        flag1 = _mm_and_si128(flag, flag1);
        in_macro = _mm_add_epi16(in_macrotemp, linea);
        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(lineb, 1));
        in_macro = _mm_srai_epi16(in_macro, 1);

        in_macro = _mm_min_epi16(i_C0, in_macro);
        in_macro = _mm_max_epi16(neg_C0, in_macro);

        in_macro = _mm_and_si128(in_macro, flag1);
        lineb = _mm_add_epi16(lineb, in_macro);

        /*********************************************************************/
        /* if(Aq < beta)                                                     */
        /*   q1 += CLIP3(-C0, C0, (q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) */
        /*********************************************************************/
        flag1 = _mm_cmpgt_epi16(beta_8x16, i_Aq);
        flag1 = _mm_and_si128(flag, flag1);
        in_macro = _mm_add_epi16(in_macrotemp, linef);
        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(linee, 1));
        in_macro = _mm_srai_epi16(in_macro, 1);

        in_macro = _mm_min_epi16(i_C0, in_macro);
        in_macro = _mm_max_epi16(neg_C0, in_macro);

        in_macro = _mm_and_si128(in_macro, flag1);
        linee = _mm_add_epi16(linee, in_macro);

        /* Commit the filtered p0 / q0 */
        linec = temp;
        lined = temp1;

        /*********************************************************************/
        /* Inverse transpose: columns -> rows                                */
        /*********************************************************************/
        int1 = _mm_unpacklo_epi16(linea, linee);
        linee = _mm_unpackhi_epi16(linea, linee);

        int2 = _mm_unpacklo_epi16(linec, lineg);
        lineg = _mm_unpackhi_epi16(linec, lineg);

        linea = _mm_unpacklo_epi16(int1, int2);
        int3 = _mm_unpackhi_epi16(int1, int2);

        linec = _mm_unpacklo_epi16(linee, lineg);
        lineg = _mm_unpackhi_epi16(linee, lineg);

        int1 = _mm_unpacklo_epi16(lineb, linef);
        linef = _mm_unpackhi_epi16(lineb, linef);

        int2 = _mm_unpacklo_epi16(lined, lineh);
        lineh = _mm_unpackhi_epi16(lined, lineh);

        lineb = _mm_unpacklo_epi16(int1, int2);
        int4 = _mm_unpackhi_epi16(int1, int2);

        lined = _mm_unpacklo_epi16(linef, lineh);
        lineh = _mm_unpackhi_epi16(linef, lineh);

        int1 = _mm_unpackhi_epi16(linea, lineb);
        linea = _mm_unpacklo_epi16(linea, lineb);

        int2 = _mm_unpacklo_epi16(int3, int4);
        high1 = _mm_unpackhi_epi16(int3, int4);

        lineb = _mm_unpacklo_epi16(linec, lined);
        linef = _mm_unpackhi_epi16(linec, lined);

        lined = _mm_unpacklo_epi16(lineg, lineh);
        lineh = _mm_unpackhi_epi16(lineg, lineh);

        linee = int1;
        lineg = high1;
        linec = int2;

        /* linea..lineh are rows 0..7 again, in natural order */

        /*********************************************************************/
        /* Pack to 8 bit with unsigned saturation and store each row         */
        /*********************************************************************/
        linea = _mm_packus_epi16(linea, zero);
        _mm_storel_epi64((__m128i *)(pu1_row), linea);

        lineb = _mm_packus_epi16(lineb, zero);
        _mm_storel_epi64((__m128i *)(pu1_row + src_strd), lineb);

        linec = _mm_packus_epi16(linec, zero);
        _mm_storel_epi64((__m128i *)(pu1_row + 2 * src_strd), linec);

        lined = _mm_packus_epi16(lined, zero);
        _mm_storel_epi64((__m128i *)(pu1_row + 3 * src_strd), lined);

        linee = _mm_packus_epi16(linee, zero);
        _mm_storel_epi64((__m128i *)(pu1_row + 4 * src_strd), linee);

        linef = _mm_packus_epi16(linef, zero);
        _mm_storel_epi64((__m128i *)(pu1_row + 5 * src_strd), linef);

        lineg = _mm_packus_epi16(lineg, zero);
        _mm_storel_epi64((__m128i *)(pu1_row + 6 * src_strd), lineg);

        lineh = _mm_packus_epi16(lineh, zero);
        _mm_storel_epi64((__m128i *)(pu1_row + 7 * src_strd), lineh);
    }
}
1112 
1113 /*****************************************************************************/
1114 /*                                                                           */
1115 /*  Function Name : ih264_deblk_luma_horz_bslt4_ssse3()                      */
1116 /*                                                                           */
1117 /*  Description   : This function performs filtering of a luma block         */
1118 /*                  horizontal edge when boundary strength is less than 4.   */
1119 /*                                                                           */
1120 /*  Inputs        : pu1_src       - pointer to the src sample q0             */
1121 /*                  src_strd      - source stride                            */
1122 /*                  alpha         - alpha value for the boundary             */
1123 /*                  beta          - beta value for the boundary              */
1124 /*                  u4_bs         - packed Boundary strength array           */
1125 /*                  pu1_cliptab   - tc0_table                                */
1126 /*                                                                           */
1127 /*  Globals       : None                                                     */
1128 /*                                                                           */
1129 /*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
1130 /*                  title "Filtering process for edges for bS less than 4"   */
1131 /*                  in ITU T Rec H.264.                                      */
1132 /*                                                                           */
1133 /*  Outputs       : None                                                     */
1134 /*                                                                           */
1135 /*  Returns       : None                                                     */
1136 /*                                                                           */
1137 /*  Issues        : None                                                     */
1138 /*                                                                           */
1139 /*  Revision History:                                                        */
1140 /*                                                                           */
1141 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1142 /*         12 02 2015   Naveen Kumar P  Initial version                      */
1143 /*                                                                           */
1144 /*****************************************************************************/
ih264_deblk_luma_horz_bslt4_ssse3(UWORD8 * pu1_src,WORD32 src_strd,WORD32 alpha,WORD32 beta,UWORD32 u4_bs,const UWORD8 * pu1_cliptab)1145 void ih264_deblk_luma_horz_bslt4_ssse3(UWORD8 *pu1_src,
1146                                        WORD32 src_strd,
1147                                        WORD32 alpha,
1148                                        WORD32 beta,
1149                                        UWORD32 u4_bs,
1150                                        const UWORD8 *pu1_cliptab)
1151 {
1152     WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
1153     UWORD8 *pu1_HorzPixel;
1154     __m128i zero = _mm_setzero_si128();
1155     __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16, C_8x16, C_hi_8x16;
1156     __m128i q0_16x8, q1_16x8, q2_16x8, p0_16x8, p1_16x8, p2_16x8;
1157     __m128i temp1, temp2;
1158     __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
1159     __m128i in_macro_16x8, in_macro_hi_16x8;
1160     __m128i const_val4_8x16;
1161     UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
1162     UWORD8 clip0, clip1, clip2, clip3;
1163 
1164     pu1_HorzPixel = pu1_src - (src_strd << 2);
1165 
1166     i16_posQ1 = src_strd;
1167     i16_posQ2 = X2(src_strd);
1168     i16_posP0 = X3(src_strd);
1169     i16_posP1 = X2(src_strd);
1170     i16_posP2 = src_strd;
1171 
1172     q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
1173     q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
1174 
1175     u1_Bs0 = (u4_bs >> 24) & 0xff;
1176     u1_Bs1 = (u4_bs >> 16) & 0xff;
1177     u1_Bs2 = (u4_bs >> 8) & 0xff;
1178     u1_Bs3 = (u4_bs >> 0) & 0xff;
1179     clip0 = pu1_cliptab[u1_Bs0];
1180     clip1 = pu1_cliptab[u1_Bs1];
1181     clip2 = pu1_cliptab[u1_Bs2];
1182     clip3 = pu1_cliptab[u1_Bs3];
1183 
1184     Alpha_8x16 = _mm_set1_epi16(alpha);
1185     Beta_8x16 = _mm_set1_epi16(beta);
1186 
1187     bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
1188                                  u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
1189                                  u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
1190 
1191     C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
1192                            clip2, clip1, clip1, clip1, clip1, clip0, clip0,
1193                            clip0, clip0);
1194 
1195     bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
1196     bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
1197     C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
1198     C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero);
1199 
1200     p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
1201     p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
1202     p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
1203     q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
1204 
1205     //Cond1 (ABS(p0 - q0) < alpha)
1206     temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
1207     temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
1208     temp1 = _mm_add_epi8(temp1, temp2);
1209 
1210     temp2 = _mm_unpacklo_epi8(temp1, zero);
1211     temp1 = _mm_unpackhi_epi8(temp1, zero);
1212 
1213     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1214     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
1215 
1216     flag1_16x8 = _mm_packs_epi16(temp2, temp1);
1217     flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
1218 
1219     //Cond2 (ABS(q1 - q0) < beta)
1220     temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
1221     temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
1222     temp1 = _mm_add_epi8(temp1, temp2);
1223 
1224     temp2 = _mm_unpacklo_epi8(temp1, zero);
1225     temp1 = _mm_unpackhi_epi8(temp1, zero);
1226 
1227     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1228     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1229 
1230     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1231 
1232     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1233 
1234     //Cond3 (ABS(p1 - p0) < beta)
1235     temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
1236     temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
1237     temp1 = _mm_add_epi8(temp1, temp2);
1238 
1239     temp2 = _mm_unpacklo_epi8(temp1, zero);
1240     temp1 = _mm_unpackhi_epi8(temp1, zero);
1241 
1242     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1243     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1244 
1245     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1246 
1247     // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
1248     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1249 
1250     // (ABS(p2 - p0) < beta)
1251     temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
1252     temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
1253     temp1 = _mm_add_epi8(temp1, temp2);
1254 
1255     temp2 = _mm_unpacklo_epi8(temp1, zero);
1256     temp1 = _mm_unpackhi_epi8(temp1, zero);
1257     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1258     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1259 
1260     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1261     flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1262 
1263     temp2 = _mm_subs_epi16(zero, temp2);
1264     temp1 = _mm_subs_epi16(zero, temp1);
1265 
1266     C_8x16 = _mm_add_epi16(C0_8x16, temp2);
1267     C_hi_8x16 = _mm_add_epi16(C0_hi_8x16, temp1);
1268 
1269     // (ABS(q2 - q0) < beta)
1270     temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
1271     temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
1272     temp1 = _mm_add_epi8(temp1, temp2);
1273 
1274     temp2 = _mm_unpacklo_epi8(temp1, zero);
1275     temp1 = _mm_unpackhi_epi8(temp1, zero);
1276     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1277     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1278 
1279     flag3_16x8 = _mm_packs_epi16(temp2, temp1);
1280     flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
1281 
1282     temp2 = _mm_subs_epi16(zero, temp2);
1283     temp1 = _mm_subs_epi16(zero, temp1);
1284 
1285     C_8x16 = _mm_add_epi16(C_8x16, temp2);
1286     C_hi_8x16 = _mm_add_epi16(C_hi_8x16, temp1);
1287 
1288     const_val4_8x16 = _mm_set1_epi16(4);
1289     temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
1290                            _mm_unpacklo_epi8(p0_16x8, zero));
1291     temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
1292                            _mm_unpacklo_epi8(q1_16x8, zero));
1293     temp1 = _mm_slli_epi16(temp1, 2);
1294     temp1 = _mm_add_epi16(temp1, temp2);
1295     temp1 = _mm_add_epi16(temp1, const_val4_8x16);
1296     in_macro_16x8 = _mm_srai_epi16(temp1, 3);
1297 
1298     temp1 = _mm_subs_epi16(_mm_unpackhi_epi8(q0_16x8, zero),
1299                            _mm_unpackhi_epi8(p0_16x8, zero));
1300     temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p1_16x8, zero),
1301                            _mm_unpackhi_epi8(q1_16x8, zero));
1302     temp1 = _mm_slli_epi16(temp1, 2);
1303     temp1 = _mm_add_epi16(temp1, temp2);
1304     temp1 = _mm_add_epi16(temp1, const_val4_8x16);
1305     in_macro_hi_16x8 = _mm_srai_epi16(temp1, 3);
1306 
1307     in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
1308     in_macro_hi_16x8 = _mm_min_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
1309     C_8x16 = _mm_subs_epi16(zero, C_8x16);
1310     C_hi_8x16 = _mm_subs_epi16(zero, C_hi_8x16);
1311     in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
1312     in_macro_hi_16x8 = _mm_max_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
1313 
1314     temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
1315     temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p0_16x8, zero), in_macro_hi_16x8);
1316 
1317     temp1 = _mm_packus_epi16(temp1, temp2);
1318 
1319     temp1 = _mm_and_si128(temp1, flag1_16x8);
1320     temp2 = _mm_and_si128(p0_16x8,
1321                           _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
1322 
1323     temp1 = _mm_add_epi8(temp1, temp2);
1324 
1325     _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), temp1);
1326 
1327     temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
1328     temp2 = _mm_sub_epi16(_mm_unpackhi_epi8(q0_16x8, zero), in_macro_hi_16x8);
1329 
1330     temp1 = _mm_packus_epi16(temp1, temp2);
1331 
1332     temp1 = _mm_and_si128(temp1, flag1_16x8);
1333     temp2 = _mm_and_si128(q0_16x8,
1334                           _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
1335 
1336     temp1 = _mm_add_epi8(temp1, temp2);
1337     _mm_storeu_si128((__m128i *)(pu1_src), temp1);
1338 
1339     //if(Ap < Beta)
1340     temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
1341                           _mm_unpacklo_epi8(p0_16x8, zero));
1342     temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
1343     //temp2 = _mm_subs_epi16(zero,temp2);
1344     temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
1345     temp2 = _mm_add_epi16(temp1, temp2);
1346     in_macro_16x8 = _mm_srai_epi16(temp2, 1);
1347 
1348     temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
1349                           _mm_unpackhi_epi8(p0_16x8, zero));
1350     temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(p1_16x8, zero), 1);
1351     //temp2 = _mm_subs_epi16(zero,temp2);
1352     temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p2_16x8, zero), temp2);
1353     temp2 = _mm_add_epi16(temp1, temp2);
1354     in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
1355 
1356     in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
1357     in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
1358     C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
1359     C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
1360     in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
1361     in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
1362 
1363     temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
1364     temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p1_16x8, zero), in_macro_hi_16x8);
1365 
1366     temp1 = _mm_packus_epi16(temp1, temp2);
1367 
1368     temp1 = _mm_and_si128(temp1, flag2_16x8);
1369     temp2 = _mm_and_si128(p1_16x8,
1370                           _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
1371     temp1 = _mm_add_epi8(temp1, temp2);
1372     _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), temp1);
1373 
1374     //if(Aq < Beta)
1375     temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
1376                           _mm_unpacklo_epi8(p0_16x8, zero));
1377     temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
1378     //temp2 = _mm_slli_epi16 (temp2, 1);
1379     temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
1380     temp2 = _mm_add_epi16(temp1, temp2);
1381     in_macro_16x8 = _mm_srai_epi16(temp2, 1);
1382 
1383     temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
1384                           _mm_unpackhi_epi8(p0_16x8, zero));
1385     temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(q1_16x8, zero), 1);
1386     //temp2 = _mm_slli_epi16 (temp2, 1);
1387     temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(q2_16x8, zero), temp2);
1388     temp2 = _mm_add_epi16(temp1, temp2);
1389     in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
1390 
1391     in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
1392     in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
1393     C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
1394     C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
1395     in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
1396     in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
1397 
1398     temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
1399     temp2 = _mm_add_epi16(_mm_unpackhi_epi8(q1_16x8, zero), in_macro_hi_16x8);
1400 
1401     temp1 = _mm_packus_epi16(temp1, temp2);
1402 
1403     temp1 = _mm_and_si128(temp1, flag3_16x8);
1404     temp2 = _mm_and_si128(q1_16x8,
1405                           _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
1406     temp1 = _mm_add_epi8(temp1, temp2);
1407 
1408     _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), temp1);
1409 
1410 }
1411 
1412 /*****************************************************************************/
1413 /*                                                                           */
1414 /*  Function Name : ih264_deblk_luma_vert_bs4_mbaff_ssse3()                  */
1415 /*                                                                           */
1416 /*  Description   : This function performs filtering of a luma block         */
1417 /*                  vertical edge when boundary strength is set to 4.        */
1418 /*                                                                           */
1419 /*  Inputs        : pu1_src       - pointer to the src sample q0             */
1420 /*                  src_strd      - source stride                            */
1421 /*                  alpha         - alpha value for the boundary             */
1422 /*                  beta          - beta value for the boundary              */
1423 /*                                                                           */
1424 /*  Globals       : None                                                     */
1425 /*                                                                           */
1426 /*  Processing    : When the function is called twice, this operation is as  */
1427 /*                  described in Sec. 8.7.2.3 under the title "Filtering     */
1428 /*                  process for edges for bS equal to 4" in ITU T Rec H.264. */
1429 /*                                                                           */
1430 /*  Outputs       : None                                                     */
1431 /*                                                                           */
1432 /*  Returns       : None                                                     */
1433 /*                                                                           */
1434 /*  Issues        : None                                                     */
1435 /*                                                                           */
1436 /*  Revision History:                                                        */
1437 /*                                                                           */
1438 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1439 /*         12 02 2015   Naveen Kumar P  Initial version                      */
1440 /*                                                                           */
1441 /*****************************************************************************/
ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 * pu1_src,WORD32 src_strd,WORD32 alpha,WORD32 beta)1442 void ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
1443                                            WORD32 src_strd,
1444                                            WORD32 alpha,
1445                                            WORD32 beta)
1446 {
1447     __m128i zero = _mm_setzero_si128();
1448     __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
1449     __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
1450     __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
1451     __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
1452     __m128i q0_16x8_1;
1453     __m128i p0_16x8_1;
1454     __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
1455     __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
1456     __m128i temp1, temp2, temp3, temp4, temp5, temp6;
1457     __m128i Alpha_8x16, Beta_8x16;
1458     __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
1459     __m128i const_val2_16x8 = _mm_set1_epi16(2);
1460     __m128i line1, line2, line3, line4, line5, line6, line7, line8;
1461 
1462     Alpha_8x16 = _mm_set1_epi16(alpha);
1463     Beta_8x16 = _mm_set1_epi16(beta);
1464 
1465     line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
1466     line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
1467     line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
1468     line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
1469     line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
1470     line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
1471     line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
1472     line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
1473 
1474     temp1 = _mm_unpacklo_epi8(line1, line2);
1475     temp2 = _mm_unpacklo_epi8(line3, line4);
1476     temp3 = _mm_unpacklo_epi8(line5, line6);
1477     temp4 = _mm_unpacklo_epi8(line7, line8);
1478 
1479     line1 = _mm_unpacklo_epi16(temp1, temp2);
1480     line2 = _mm_unpackhi_epi16(temp1, temp2);
1481     line3 = _mm_unpacklo_epi16(temp3, temp4);
1482     line4 = _mm_unpackhi_epi16(temp3, temp4);
1483 
1484     p1_8x16 = _mm_unpacklo_epi32(line1, line3);
1485     p0_8x16 = _mm_unpackhi_epi32(line1, line3);
1486     q0_8x16 = _mm_unpacklo_epi32(line2, line4);
1487     q1_8x16 = _mm_unpackhi_epi32(line2, line4);
1488 
1489     p3_16x8 = _mm_unpacklo_epi64(p1_8x16, zero);
1490     p2_16x8 = _mm_unpackhi_epi64(p1_8x16, zero);
1491     q2_16x8 = _mm_unpacklo_epi64(q1_8x16, zero);
1492     q3_16x8 = _mm_unpackhi_epi64(q1_8x16, zero);
1493     p1_16x8 = _mm_unpacklo_epi64(p0_8x16, zero);
1494     p0_16x8 = _mm_unpackhi_epi64(p0_8x16, zero);
1495     q0_16x8 = _mm_unpacklo_epi64(q0_8x16, zero);
1496     q1_16x8 = _mm_unpackhi_epi64(q0_8x16, zero);
1497 
1498     //Cond1 (ABS(p0 - q0) < alpha)
1499     temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
1500     temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
1501     temp1 = _mm_add_epi8(temp1, temp2);
1502 
1503     temp2 = _mm_unpacklo_epi8(temp1, zero);
1504     temp1 = _mm_unpackhi_epi8(temp1, zero);
1505 
1506     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1507     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
1508 
1509     flag1_16x8 = _mm_packs_epi16(temp2, temp1);
1510 
1511     //Cond2 (ABS(q1 - q0) < beta)
1512     temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
1513     temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
1514     temp1 = _mm_add_epi8(temp1, temp2);
1515 
1516     temp2 = _mm_unpacklo_epi8(temp1, zero);
1517     temp1 = _mm_unpackhi_epi8(temp1, zero);
1518 
1519     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1520     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1521 
1522     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1523 
1524     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1525 
1526     //Cond3 (ABS(p1 - p0) < beta)
1527     temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
1528     temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
1529     temp1 = _mm_add_epi8(temp1, temp2);
1530 
1531     temp2 = _mm_unpacklo_epi8(temp1, zero);
1532     temp1 = _mm_unpackhi_epi8(temp1, zero);
1533 
1534     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1535     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1536 
1537     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1538 
1539     // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
1540     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1541 
1542     // (ABS(p0 - q0) < ((alpha >> 2) + 2))
1543     temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
1544     temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
1545     temp1 = _mm_add_epi8(temp1, temp2);
1546     Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
1547     Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
1548 
1549     temp2 = _mm_unpacklo_epi8(temp1, zero);
1550     temp1 = _mm_unpackhi_epi8(temp1, zero);
1551     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1552     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
1553 
1554     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1555     flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1556 
1557     // (ABS(p2 - p0) < beta)
1558     temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
1559     temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
1560     temp1 = _mm_add_epi8(temp1, temp2);
1561 
1562     temp2 = _mm_unpacklo_epi8(temp1, zero);
1563     temp1 = _mm_unpackhi_epi8(temp1, zero);
1564     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1565     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1566 
1567     flag3_16x8 = _mm_packs_epi16(temp2, temp1);
1568     flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
1569 
1570     // (ABS(q2 - q0) < beta)
1571     temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
1572     temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
1573     temp1 = _mm_add_epi8(temp1, temp2);
1574 
1575     temp2 = _mm_unpacklo_epi8(temp1, zero);
1576     temp1 = _mm_unpackhi_epi8(temp1, zero);
1577     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1578     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1579 
1580     flag4_16x8 = _mm_packs_epi16(temp2, temp1);
1581     flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
1582 
1583     // First 8 pixels
1584     p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
1585     p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
1586     p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
1587     p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
1588     q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
1589     q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
1590     q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
1591     q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
1592 
1593     // p0_1 and q0_1
1594     temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
1595     temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
1596     temp5 = _mm_add_epi16(temp1, const_val2_16x8);
1597     temp6 = _mm_add_epi16(temp2, const_val2_16x8);
1598     temp3 = _mm_slli_epi16(p1_8x16, 1);
1599     temp4 = _mm_slli_epi16(q1_8x16, 1);
1600     temp1 = _mm_add_epi16(temp5, temp3);
1601     temp2 = _mm_add_epi16(temp6, temp4);
1602     p0_16x8_1 = _mm_srai_epi16(temp1, 2);
1603     q0_16x8_1 = _mm_srai_epi16(temp2, 2);
1604 
1605     // p1_2 and q1_2
1606     temp6 = _mm_add_epi16(temp6, p0_8x16);
1607     temp5 = _mm_add_epi16(temp5, q0_8x16);
1608     temp1 = _mm_add_epi16(temp6, p2_8x16);
1609     temp2 = _mm_add_epi16(temp5, q2_8x16);
1610     p1_16x8_2 = _mm_srai_epi16(temp1, 2);
1611     q1_16x8_2 = _mm_srai_epi16(temp2, 2);
1612 
1613     // p0_2 and q0_2
1614     temp1 = _mm_add_epi16(temp3, p2_8x16);
1615     temp2 = _mm_add_epi16(temp4, q2_8x16);
1616     temp1 = _mm_add_epi16(temp1, q1_8x16);
1617     temp2 = _mm_add_epi16(temp2, p1_8x16);
1618     temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
1619     temp3 = _mm_slli_epi16(temp3, 1);
1620     temp1 = _mm_add_epi16(temp1, temp3);
1621     temp2 = _mm_add_epi16(temp2, temp3);
1622     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
1623     temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
1624     p0_16x8_2 = _mm_srai_epi16(temp1, 3);
1625     q0_16x8_2 = _mm_srai_epi16(temp2, 3);
1626 
1627     // p2_2 and q2_2
1628     temp1 = _mm_add_epi16(temp6, const_val2_16x8);
1629     temp2 = _mm_add_epi16(temp5, const_val2_16x8);
1630     temp3 = _mm_slli_epi16(p2_8x16, 1);
1631     temp4 = _mm_slli_epi16(q2_8x16, 1);
1632     temp3 = _mm_add_epi16(p2_8x16, temp3);
1633     temp4 = _mm_add_epi16(q2_8x16, temp4);
1634     temp5 = _mm_slli_epi16(p3_8x16, 1);
1635     temp6 = _mm_slli_epi16(q3_8x16, 1);
1636     temp1 = _mm_add_epi16(temp1, temp3);
1637     temp2 = _mm_add_epi16(temp2, temp4);
1638     temp1 = _mm_add_epi16(temp1, temp5);
1639     temp2 = _mm_add_epi16(temp2, temp6);
1640     p2_16x8_2 = _mm_srai_epi16(temp1, 3);
1641     q2_16x8_2 = _mm_srai_epi16(temp2, 3);
1642 
1643     // p0_1 and q0_1
1644     p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, zero);
1645     q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, zero);
1646 
1647     // p1_2 and q1_2
1648     p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, zero);
1649     q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, zero);
1650 
1651     // p0_2 and q0_2
1652     p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, zero);
1653     q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, zero);
1654 
1655     // p2_2 and q2_2
1656     p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, zero);
1657     q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, zero);
1658 
1659     // p0 and q0
1660     p0_16x8 = _mm_and_si128(p0_16x8,
1661                             _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
1662     p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
1663     p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
1664     q0_16x8 = _mm_and_si128(q0_16x8,
1665                             _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
1666     q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
1667     q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
1668 
1669     // p0 and q0
1670     p0_16x8 = _mm_and_si128(p0_16x8,
1671                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
1672     p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
1673     p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
1674     q0_16x8 = _mm_and_si128(q0_16x8,
1675                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
1676     q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
1677     q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
1678 
1679     // p1 and q1
1680     p1_16x8 = _mm_and_si128(p1_16x8,
1681                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
1682     p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
1683     p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
1684     q1_16x8 = _mm_and_si128(q1_16x8,
1685                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
1686     q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
1687     q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
1688 
1689     // p2 and q2
1690     p2_16x8 = _mm_and_si128(p2_16x8,
1691                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
1692     p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
1693     p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
1694     q2_16x8 = _mm_and_si128(q2_16x8,
1695                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
1696     q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
1697     q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
1698 
1699     temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
1700     temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
1701     temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
1702     temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
1703 
1704     p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
1705     p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
1706     q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
1707     q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
1708 
1709     line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
1710     line2 = _mm_srli_si128(line1, 8);
1711     line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
1712     line4 = _mm_srli_si128(line3, 8);
1713     line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
1714     line6 = _mm_srli_si128(line5, 8);
1715     line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
1716     line8 = _mm_srli_si128(line7, 8);
1717 
1718     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
1719     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
1720     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
1721     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
1722     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
1723     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
1724     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
1725     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
1726 
1727 }
1728 
1729 /*****************************************************************************/
1730 /*                                                                           */
1731 /*  Function Name : ih264_deblk_luma_vert_bslt4_mbaff_ssse3()                */
1732 /*                                                                           */
1733 /*  Description   : This function performs filtering of a luma block         */
1734 /*                  vertical edge when boundary strength is less than 4.     */
1735 /*                                                                           */
1736 /*  Inputs        : pu1_src       - pointer to the src sample q0             */
1737 /*                  src_strd      - source stride                            */
1738 /*                  alpha         - alpha value for the boundary             */
1739 /*                  beta          - beta value for the boundary              */
1740 /*                  u4_bs         - packed Boundary strength array           */
1741 /*                  pu1_cliptab   - tc0_table                                */
1742 /*                                                                           */
1743 /*  Globals       : None                                                     */
1744 /*                                                                           */
1745 /*  Processing    : When the function is called twice, this operation is as  */
1746 /*                  described in Sec. 8.7.2.3 under the title "Filtering     */
1747 /*                  process for edges for bS less than 4" in ITU T Rec H.264.*/
1748 /*                                                                           */
1749 /*  Outputs       : None                                                     */
1750 /*                                                                           */
1751 /*  Returns       : None                                                     */
1752 /*                                                                           */
1753 /*  Issues        : None                                                     */
1754 /*                                                                           */
1755 /*  Revision History:                                                        */
1756 /*                                                                           */
1757 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
1758 /*         12 02 2015   Naveen Kumar P  Initial version                      */
1759 /*                                                                           */
1760 /*****************************************************************************/
ih264_deblk_luma_vert_bslt4_mbaff_ssse3(UWORD8 * pu1_src,WORD32 src_strd,WORD32 alpha,WORD32 beta,UWORD32 u4_bs,const UWORD8 * pu1_cliptab)1761 void ih264_deblk_luma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
1762                                              WORD32 src_strd,
1763                                              WORD32 alpha,
1764                                              WORD32 beta,
1765                                              UWORD32 u4_bs,
1766                                              const UWORD8 *pu1_cliptab)
1767 {
1768     __m128i zero = _mm_setzero_si128();
1769     __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C_8x16;
1770     __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
1771     __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
1772     __m128i temp1, temp2, temp3, temp4;
1773     __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
1774     __m128i in_macro_16x8;
1775     __m128i const_val4_8x16;
1776     UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
1777     UWORD8 clip0, clip1, clip2, clip3;
1778     __m128i line1, line2, line3, line4, line5, line6, line7, line8;
1779     __m128i q0_16x8_1, q1_16x8_1, q0_16x8_2;
1780     __m128i p0_16x8_1, p1_16x8_1, p0_16x8_2;
1781 
1782     line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
1783     line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
1784     line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
1785     line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
1786     line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
1787     line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
1788     line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
1789     line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
1790 
1791     temp1 = _mm_unpacklo_epi8(line1, line2);
1792     temp2 = _mm_unpacklo_epi8(line3, line4);
1793     temp3 = _mm_unpacklo_epi8(line5, line6);
1794     temp4 = _mm_unpacklo_epi8(line7, line8);
1795 
1796     line1 = _mm_unpacklo_epi16(temp1, temp2);
1797     line2 = _mm_unpackhi_epi16(temp1, temp2);
1798     line3 = _mm_unpacklo_epi16(temp3, temp4);
1799     line4 = _mm_unpackhi_epi16(temp3, temp4);
1800 
1801     temp1 = _mm_unpacklo_epi32(line1, line3);
1802     temp2 = _mm_unpackhi_epi32(line1, line3);
1803     temp3 = _mm_unpacklo_epi32(line2, line4);
1804     temp4 = _mm_unpackhi_epi32(line2, line4);
1805 
1806     p3_16x8 = _mm_unpacklo_epi64(temp1, zero);
1807     p2_16x8 = _mm_unpackhi_epi64(temp1, zero);
1808     q2_16x8 = _mm_unpacklo_epi64(temp4, zero);
1809     q3_16x8 = _mm_unpackhi_epi64(temp4, zero);
1810     p1_16x8 = _mm_unpacklo_epi64(temp2, zero);
1811     p0_16x8 = _mm_unpackhi_epi64(temp2, zero);
1812     q0_16x8 = _mm_unpacklo_epi64(temp3, zero);
1813     q1_16x8 = _mm_unpackhi_epi64(temp3, zero);
1814 
1815     u1_Bs0 = (u4_bs >> 24) & 0xff;
1816     u1_Bs1 = (u4_bs >> 16) & 0xff;
1817     u1_Bs2 = (u4_bs >> 8) & 0xff;
1818     u1_Bs3 = (u4_bs >> 0) & 0xff;
1819     clip0 = pu1_cliptab[u1_Bs0];
1820     clip1 = pu1_cliptab[u1_Bs1];
1821     clip2 = pu1_cliptab[u1_Bs2];
1822     clip3 = pu1_cliptab[u1_Bs3];
1823 
1824     Alpha_8x16 = _mm_set1_epi16(alpha);
1825     Beta_8x16 = _mm_set1_epi16(beta);
1826 
1827     bs_flag_16x8b = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
1828                                  u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
1829 
1830     C0_16x8 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, clip3, clip3, clip2, clip2,
1831                            clip1, clip1, clip0, clip0);
1832 
1833     bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
1834     bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
1835     C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
1836 
1837     //Cond1 (ABS(p0 - q0) < alpha)
1838     temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
1839     temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
1840     temp1 = _mm_add_epi8(temp1, temp2);
1841 
1842     temp2 = _mm_unpacklo_epi8(temp1, zero);
1843     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1844 
1845     flag1_16x8 = _mm_packs_epi16(temp2, zero);
1846     flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
1847 
1848     //Cond2 (ABS(q1 - q0) < beta)
1849     temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
1850     temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
1851     temp1 = _mm_add_epi8(temp1, temp2);
1852 
1853     temp2 = _mm_unpacklo_epi8(temp1, zero);
1854     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1855 
1856     flag2_16x8 = _mm_packs_epi16(temp2, zero);
1857     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1858 
1859     //Cond3 (ABS(p1 - p0) < beta)
1860     temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
1861     temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
1862     temp1 = _mm_add_epi8(temp1, temp2);
1863 
1864     temp2 = _mm_unpacklo_epi8(temp1, zero);
1865     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1866 
1867     flag2_16x8 = _mm_packs_epi16(temp2, zero);
1868 
1869     // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
1870     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1871 
1872     // (ABS(p2 - p0) < beta)
1873     temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
1874     temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
1875     temp1 = _mm_add_epi8(temp1, temp2);
1876 
1877     temp2 = _mm_unpacklo_epi8(temp1, zero);
1878     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1879 
1880     flag2_16x8 = _mm_packs_epi16(temp2, zero);
1881     flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1882 
1883     temp2 = _mm_subs_epi16(zero, temp2);
1884 
1885     C_8x16 = _mm_add_epi16(C0_8x16, temp2);
1886 
1887     // (ABS(q2 - q0) < beta)
1888     temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
1889     temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
1890     temp1 = _mm_add_epi8(temp1, temp2);
1891 
1892     temp2 = _mm_unpacklo_epi8(temp1, zero);
1893     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1894 
1895     flag3_16x8 = _mm_packs_epi16(temp2, zero);
1896     flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
1897 
1898     temp2 = _mm_subs_epi16(zero, temp2);
1899 
1900     C_8x16 = _mm_add_epi16(C_8x16, temp2);
1901 
1902     const_val4_8x16 = _mm_set1_epi16(4);
1903     temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
1904                            _mm_unpacklo_epi8(p0_16x8, zero));
1905     temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
1906                            _mm_unpacklo_epi8(q1_16x8, zero));
1907     temp1 = _mm_slli_epi16(temp1, 2);
1908     temp1 = _mm_add_epi16(temp1, temp2);
1909     temp1 = _mm_add_epi16(temp1, const_val4_8x16);
1910     in_macro_16x8 = _mm_srai_epi16(temp1, 3);
1911 
1912     in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
1913     C_8x16 = _mm_subs_epi16(zero, C_8x16);
1914     in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
1915 
1916     // p0
1917     temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
1918 
1919     temp1 = _mm_packus_epi16(temp1, zero);
1920 
1921     p0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
1922     p0_16x8_2 = _mm_and_si128(
1923                     p0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
1924 
1925     p0_16x8_1 = _mm_add_epi8(p0_16x8_1, p0_16x8_2);
1926 
1927     // q0
1928     temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
1929 
1930     temp1 = _mm_packus_epi16(temp1, zero);
1931 
1932     q0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
1933     q0_16x8_2 = _mm_and_si128(
1934                     q0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
1935 
1936     q0_16x8_1 = _mm_add_epi8(q0_16x8_1, q0_16x8_2);
1937 
1938     //if(Ap < Beta)
1939     temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
1940                           _mm_unpacklo_epi8(p0_16x8, zero));
1941     temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
1942     //temp2 = _mm_subs_epi16(zero,temp2);
1943     temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
1944     temp2 = _mm_add_epi16(temp1, temp2);
1945     in_macro_16x8 = _mm_srai_epi16(temp2, 1);
1946 
1947     in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
1948     C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
1949     in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
1950 
1951     // p1
1952     temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
1953 
1954     temp1 = _mm_packus_epi16(temp1, zero);
1955 
1956     p1_16x8_1 = _mm_and_si128(temp1, flag2_16x8);
1957     p1_16x8 = _mm_and_si128(p1_16x8,
1958                             _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
1959     p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_1);
1960 
1961     //if(Aq < Beta)
1962     temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
1963                           _mm_unpacklo_epi8(p0_16x8, zero));
1964     temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
1965     //temp2 = _mm_slli_epi16 (temp2, 1);
1966     temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
1967     temp2 = _mm_add_epi16(temp1, temp2);
1968     in_macro_16x8 = _mm_srai_epi16(temp2, 1);
1969 
1970     in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
1971     C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
1972     in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
1973 
1974     temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
1975 
1976     // q1
1977     temp1 = _mm_packus_epi16(temp1, zero);
1978 
1979     q1_16x8_1 = _mm_and_si128(temp1, flag3_16x8);
1980     q1_16x8 = _mm_and_si128(q1_16x8,
1981                             _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
1982     q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_1);
1983 
1984     temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
1985     temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8_1);
1986     temp3 = _mm_unpacklo_epi8(q0_16x8_1, q1_16x8);
1987     temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
1988 
1989     line7 = _mm_unpacklo_epi16(temp1, temp2);
1990     temp1 = _mm_unpackhi_epi16(temp1, temp2);
1991     line8 = _mm_unpacklo_epi16(temp3, temp4);
1992     temp2 = _mm_unpackhi_epi16(temp3, temp4);
1993 
1994     line1 = _mm_unpacklo_epi32(line7, line8);
1995     line2 = _mm_srli_si128(line1, 8);
1996     line3 = _mm_unpackhi_epi32(line7, line8);
1997     line4 = _mm_srli_si128(line3, 8);
1998     line5 = _mm_unpacklo_epi32(temp1, temp2);
1999     line6 = _mm_srli_si128(line5, 8);
2000     line7 = _mm_unpackhi_epi32(temp1, temp2);
2001     line8 = _mm_srli_si128(line7, 8);
2002 
2003     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
2004     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
2005     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
2006     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
2007     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
2008     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
2009     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
2010     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
2011 }
2012 
2013