1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 *  ihevc_sao_atom_intr.c
22 *
23 * @brief
24 *  Contains function definitions for Sample adaptive offset(SAO) used in-loop
25 * filtering
26 *
27 * @author
28 * 100592
29 *
30 * @par List of Functions:
31 *   - ihevc_sao_band_offset_luma_ssse3()
32 *   - ihevc_sao_band_offset_chroma_ssse3()
33 *   - ihevc_sao_edge_offset_class0_ssse3()
34 *   - ihevc_sao_edge_offset_class0_chroma_ssse3()
35 *   - ihevc_sao_edge_offset_class1_ssse3()
36 *   - ihevc_sao_edge_offset_class1_chroma_ssse3()
37 *   - ihevc_sao_edge_offset_class2_ssse3()
38 *   - ihevc_sao_edge_offset_class2_chroma_ssse3()
39 *   - ihevc_sao_edge_offset_class3_ssse3()
40 *   - ihevc_sao_edge_offset_class3_chroma_ssse3()
41 *
42 * @remarks
43 *  None
44 *
45 *******************************************************************************
46 */
47 /*****************************************************************************/
48 /* File Includes                                                             */
49 /*****************************************************************************/
50 #include <stdio.h>
51 
52 #include "ihevc_typedefs.h"
53 #include "ihevc_platform_macros.h"
54 #include "ihevc_macros.h"
55 #include "ihevc_func_selector.h"
56 #include "ihevc_defs.h"
57 #include "ihevc_tables_x86_intr.h"
58 #include "ihevc_common_tables.h"
59 #include "ihevc_sao.h"
60 
61 #include <immintrin.h>
62 
63 #define NUM_BAND_TABLE  32
64 /**
65 *******************************************************************************
66 *
67 * @brief
* Has two sets of functions: band offset and edge offset, each for luma and chroma.
* Edge offset comes in horizontal, vertical, 135 degree and 45 degree variants.
70 *
71 * @par Description:
72 *
73 *
74 * @param[in-out] pu1_src
75 *  Pointer to the source
76 *
77 * @param[in] src_strd
78 *  Source stride
79 *
80 * @param[in-out] pu1_src_left
81 *  source left boundary
82 *
83 * @param[in-out] pu1_src_top
84 * Source top boundary
85 *
86 * @param[in-out] pu1_src_top_left
87 *  Source top left boundary
88 *
89 * @param[in] pu1_src_top_right
90 *  Source top right boundary
91 *
92 * @param[in] pu1_src_bot_left
93 *  Source bottom left boundary
94 *
95 * @param[in] pu1_avail
96 *  boundary availability flags
97 *
98 * @param[in] pi1_sao_offset_u
99 *  Chroma U sao offset values
100 *
101 * @param[in] pi1_sao_offset_v
102 *  Chroma V sao offset values
103 *
104 * @param[in] pi1_sao_offset
105 *  Luma sao offset values
106 *
107 * @param[in] wd
108 *  width of the source
109 
110 * @param[in] ht
111 *  height of the source
112 * @returns
113 *
114 * @remarks
115 *  None
116 *
117 *******************************************************************************
118 */
119 
120 
/**
*******************************************************************************
*
* @brief
*  SAO band offset for a luma block, implemented with SSE2/SSSE3 intrinsics
*
* @par Description:
*  1. Saves boundary samples before the block is modified:
*       - pu1_src_left[row]    <- last column of every row of the block
*       - pu1_src_top_left[0]  <- pu1_src_top[wd - 1] (old top row, last entry)
*       - pu1_src_top[0..wd)   <- last row of the block
*  2. Builds a 32-entry byte lookup table, split over two registers of 16
*     entries each. Entry i is
*       (gu2_table_band_idx[i] + (sao_band_pos << 3) + pi1_sao_offset[1 + i / 8])
*     reduced to 8 bits, with extra clipping for sao_band_pos 0 and 28..31.
*  3. For every pixel, computes idx = pixel - (sao_band_pos << 3) using 8-bit
*     wrapping arithmetic. If idx, read as a signed byte, is in 0..31 the pixel
*     is replaced by table entry idx; otherwise the pixel is left unchanged.
*
*  Columns are processed 16 at a time (two rows per iteration). If wd is not a
*  multiple of 16, one further group of 8 columns is processed (four rows per
*  iteration). There is no scalar tail in either direction.
*  NOTE(review): this implies the caller passes wd as a multiple of 8 and ht
*  as a multiple of 2 (of 4 when the 8-column path is taken) - confirm.
*
* @param[in-out] pu1_src
*  Pointer to the top-left pixel of the block; modified in place
*
* @param[in] src_strd
*  Distance in bytes between vertically adjacent pixels of pu1_src
*
* @param[out] pu1_src_left
*  Receives ht bytes: the unfiltered last column of the block
*
* @param[in-out] pu1_src_top
*  On entry element wd - 1 is read; on exit holds the unfiltered last row
*  of the block (written in groups of 8 bytes)
*
* @param[out] pu1_src_top_left
*  Element 0 receives the previous value of pu1_src_top[wd - 1]
*
* @param[in] sao_band_pos
*  Band position; multiplied by 8 to obtain the first pixel value covered
*
* @param[in] pi1_sao_offset
*  Offset array; 8 bytes are loaded, of which entries 1..4 are used
*
* @param[in] wd
*  Width of the block in pixels
*
* @param[in] ht
*  Height of the block in pixels
*
* @returns
*  None
*
* @remarks
*  gu2_table_band_idx is read with aligned 128-bit loads, so it has to be
*  16-byte aligned.
*
*******************************************************************************
*/
void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_src_left,
                                      UWORD8 *pu1_src_top,
                                      UWORD8 *pu1_src_top_left,
                                      WORD32 sao_band_pos,
                                      WORD8 *pi1_sao_offset,
                                      WORD32 wd,
                                      WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;

    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
    __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
    __m128i band_pos_16x8b;
    __m128i sao_offset;
    __m128i cmp_mask, cmp_store;

    /* Updating left and top-left and top (all taken from unfiltered pixels) */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
    }
    /* Must be read before pu1_src_top is overwritten below */
    pu1_src_top_left[0] = pu1_src_top[wd - 1];
    /* Copy the last row into pu1_src_top, 8 bytes per iteration. The column  */
    /* counter itself is the byte offset; a separate 8-bit signed counter     */
    /* could not represent offsets of 128 or more.                            */
    for(col = 0; col < wd; col += 8)
    {
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + col));
        _mm_storel_epi64((__m128i *)(pu1_src_top + col), tmp_set_128i_1);
    }

    //replicating (sao_band_pos * 8) as a 16 bit value 8 times
    band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));

    //shuffle controls for sao_offset extraction: the low byte of every 16 bit
    //lane selects offset byte k, the high byte (bit 7 set) produces zero, so
    //offset k ends up zero-extended (not sign-extended) in each lane
    tmp_set_128i_1  = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
    tmp_set_128i_2  = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
    tmp_set_128i_3  = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
    tmp_set_128i_4  = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);

    //load 8 bytes of sao offset values (entries 1..4 are used)
    sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    //loading the 32 16-bit values of gu2_table_band_idx into 4 registers
    band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
    band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
    band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
    band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));

    //band position addition
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);

    //sao_offset duplication: one offset per group of 8 table entries
    tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
    tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
    tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
    tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);

    //setting for comparison
    cmp_mask = _mm_set1_epi16(16);
    cmp_store = _mm_set1_epi16(0x00ff);

    //sao_offset addition
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);

    //masking upper 8 bits of each 16 bit band table value (sum modulo 256)
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);

    //Clipping at the ends of the 8 bit range. In the selected registers:
    // - "and" with (16 > entry) clears every entry that is not below 16
    // - "or"  with (16 > entry) sets every entry below 16 to 0xffff, which
    //   the masking further down turns into 0xff
    //NOTE(review): presumably these are exactly the entries whose modulo-256
    //sum wrapped below 0 / above 255 - confirm against gu2_table_band_idx.
    switch(sao_band_pos)
    {
        case 0:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
            band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
            break;
        case 28:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
            band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
            break;
        case 29:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
            band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
            band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
            break;
        case 30:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
            band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
            band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
            break;
        case 31:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
            band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
            band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
            break;
        default:
            break;
    }

    //sao_offset is reused as the all-zero register for comparisons
    sao_offset = _mm_setzero_si128();
    tmp_set_128i_1 = _mm_set1_epi8(1);
    //cmp_mask becomes sixteen bytes of value 16
    cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask);

    //masking upper 8 bits again: 0xffff written by the switch above must
    //become 0x00ff, otherwise the unsigned-saturating pack would give 0
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);

    //the four 8x16 bit tables are packed into two 16x8 bit tables:
    //band_table0_8x16b = entries 0..15, band_table2_8x16b = entries 16..31
    band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
    band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);

    //band_table3_8x16b is reused as sixteen bytes of value 31 (16 * 2 - 1)
    band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1);
    band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now replicated per byte
    band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1);

    //cmp_mask becomes sixteen bytes of value 15
    cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);

    /* Main path: 16 columns x 2 rows per iteration */
    for(col = wd; col >= 16; col -= 16)
    {
        pu1_src_cpy = pu1_src;
        for(row = ht; row > 0; row -= 2)
        {
            //row = 0: load 16 pixel values
            src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            //row = 1: load 16 pixel values
            src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

            //8 bit wrapping subtract: index of the pixel relative to band start
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
            //if the index (as signed byte) is less than 0 put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            //if the index is greater than 31 put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);

            //row 0
            //mask of indexes greater than 15 (cmp_mask = 15 in every byte)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
            //indexes 16 to 31 kept, everything else 0
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
            //indexes 0 to 15 kept, everything else ff
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
            //indexes 16 to 31 kept, everything else ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
            //row 1
            //mask of indexes greater than 15 (cmp_mask = 15 in every byte)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
            //indexes 16 to 31 kept, everything else 0
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
            //indexes 0 to 15 kept, everything else ff
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
            //indexes 16 to 31 kept, everything else ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);

            //row 0
            //both index vectors are equal only where both are ff, i.e. where
            //the pixel is outside the band range: keep those pixels, zero the rest
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);

            //row 1
            //same preservation of pixels that receive no offset
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);

            //table lookup: an ff index yields 0, otherwise the low 4 bits of
            //the index select the entry (0 - 15 from the first table,
            //16 - 31 from the second)
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
            //combining the results of both table halves
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            //combining the results with the preserved pixel values
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);

            //row = 0: store 16 pixel values
            _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
            //row = 1: store 16 pixel values
            _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);

            pu1_src_cpy += (src_strd << 1);
        }
        pu1_src += 16;
    }

    /* Tail path: one group of 8 columns x 4 rows per iteration */
    wd_rem = wd & 0xF;
    if(wd_rem)
    {
        pu1_src_cpy = pu1_src;
        for(row = ht; row > 0; row -= 4)
        {
            //row = 0: load 8 pixel values
            src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            //row = 1
            src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
            //row = 2
            src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
            //row = 3
            src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));

            //row 0 and row 1 packed in one register, row 2 and row 3 in another
            src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
            src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);

            //8 bit wrapping subtract: index of the pixel relative to band start
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
            //if the index (as signed byte) is less than 0 put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            //if the index is greater than 31 put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);

            //row 0 and row 1
            //mask of indexes greater than 15 (cmp_mask = 15 in every byte)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
            //indexes 16 to 31 kept, everything else 0
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
            //indexes 0 to 15 kept, everything else ff
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
            //indexes 16 to 31 kept, everything else ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
            //row 2 and row 3
            //mask of indexes greater than 15 (cmp_mask = 15 in every byte)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
            //indexes 16 to 31 kept, everything else 0
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
            //indexes 0 to 15 kept, everything else ff
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
            //indexes 16 to 31 kept, everything else ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);

            //row 0 and row 1
            //keep pixels that are outside the band range, zero the rest
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);

            //row 2 and row 3
            //keep pixels that are outside the band range, zero the rest
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);

            //table lookup: an ff index yields 0, otherwise the low 4 bits of
            //the index select the entry
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
            //combining the results of both table halves
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            //combining the results with the preserved pixel values
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);

            //getting row 1 separately (upper 8 bytes moved down)
            src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
            //getting row 3 separately (upper 8 bytes moved down)
            src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);

            //row = 0: store 8 pixel values
            _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
            //row = 1
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
            //row = 2
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
            //row = 3
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);

            pu1_src_cpy += (src_strd << 2);
        }
    }
}
437 
ihevc_sao_band_offset_chroma_ssse3(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_src_left,UWORD8 * pu1_src_top,UWORD8 * pu1_src_top_left,WORD32 sao_band_pos_u,WORD32 sao_band_pos_v,WORD8 * pi1_sao_offset_u,WORD8 * pi1_sao_offset_v,WORD32 wd,WORD32 ht)438 void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
439                                         WORD32 src_strd,
440                                         UWORD8 *pu1_src_left,
441                                         UWORD8 *pu1_src_top,
442                                         UWORD8 *pu1_src_top_left,
443                                         WORD32 sao_band_pos_u,
444                                         WORD32 sao_band_pos_v,
445                                         WORD8 *pi1_sao_offset_u,
446                                         WORD8 *pi1_sao_offset_v,
447                                         WORD32 wd,
448                                         WORD32 ht)
449 {
450     WORD32 row, col;
451     WORD8 offset = 0;
452 
453 
454     __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
455     __m128i cmp_msk2;
456     __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
457     __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
458     __m128i band_pos_u_16x8b, band_pos_v_16x8b;
459     __m128i sao_offset;
460     __m128i cmp_mask;
461 
462 
463     /* Updating left and top and top-left */
464     for(row = 0; row < ht; row++)
465     {
466         pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
467         pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
468     }
469     pu1_src_top_left[0] = pu1_src_top[wd - 2];
470     pu1_src_top_left[1] = pu1_src_top[wd - 1];
471     for(col = 0; col < wd; col += 8)
472     {
473         tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
474         _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
475         offset += 8;
476     }
477 
478     { // band _table creation
479         __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
480         // Band table for U component : band_table0_16x8b and band_table2_16x8b
481         //replicating sao_band_pos as 8 bit value 16 times
482         band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
483         //value set for sao_offset extraction
484         tmp_set_128i_1  = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
485         tmp_set_128i_2  = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
486         tmp_set_128i_3  = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
487         tmp_set_128i_4  = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);
488 
489         //loaded sao offset values
490         sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
491 
492         //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
493         band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
494         band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
495         band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
496         band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
497 
498         //band_position addition
499         band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
500         band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
501         band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
502         band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
503         //sao_offset duplication
504         temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
505         temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
506         temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
507         temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
508 
509         //sao_offset addition
510         band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
511         band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
512         band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
513         band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
514         //reuse for clipping
515         temp1_8x16b = _mm_set1_epi16(0x00ff);
516         //settng for comparision
517         cmp_mask = _mm_set1_epi16(16);
518 
519         //masking upper 8bit values of each  16 bit band table value
520         band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
521         band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
522         band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
523         band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
524 
525         //temp1_8x16b reuse for compare storage
526         switch(sao_band_pos_u)
527         {
528             case 0:
529                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
530                 band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
531                 break;
532             case 28:
533                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
534                 band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
535                 break;
536             case 29:
537                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
538                 band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
539                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
540                 band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
541                 break;
542             case 30:
543                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
544                 band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
545                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
546                 band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
547                 break;
548             case 31:
549                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
550                 band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
551                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
552                 band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
553                 break;
554             default:
555                 break;
556         }
557         //masking upper 8bit values of each  16 bit band table value
558         band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
559         band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
560         band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
561         band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
562         //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
563         band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
564         band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
565         // Band table for U component over
566 
567         // Band table for V component : band_table1_16x8b and band_table3_16x8b
568         // replicating sao_band_pos as 8 bit value 16 times
569         band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));
570 
571         //loaded sao offset values
572         sao_offset      = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
573 
574         //loading 16bit 32values of gu2_table_band_idx consecutively in 4 registers
575         temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
576         band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
577         temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
578         band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));
579 
580         //band_position addition
581         temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
582         band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
583         temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
584         band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
585         //sao_offset duplication
586         tmp_set_128i_1  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
587         tmp_set_128i_2  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
588         tmp_set_128i_3  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
589         tmp_set_128i_4  = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
590 
591         //sao_offset addition
592         temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
593         band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
594         temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
595         band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);
596 
597         //masking upper 8bit values of 16 bit band table value
598         temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
599         band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
600         temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
601         band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
602         //temp1_8x16b reuse for compare storage
603 
604         switch(sao_band_pos_v)
605         {
606             case 0:
607                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
608                 temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
609                 break;
610             case 28:
611                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
612                 band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
613                 break;
614             case 29:
615                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
616                 temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
617                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
618                 band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
619                 break;
620             case 30:
621                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
622                 band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
623                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
624                 temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
625                 break;
626             case 31:
627                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
628                 temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
629                 temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
630                 band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
631                 break;
632             default:
633                 break;
634         }
635         //masking upper 8bit values of each  16 bit band table value
636         temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
637         band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
638         temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
639         band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
640         //band table 8x16 four registers are packed into 16x8 two registers:  band_table0_8x16b and band_table2_8x16b
641         band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
642         band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
643         //band table for u and v created
644     }
645     {
646         UWORD8 *pu1_src_cpy;
647         WORD32 wd_rem;
648 
649 
650         //sao_offset is reused for zero cmp mask.
651         sao_offset = _mm_setzero_si128();
652         tmp_set_128i_1 = _mm_set1_epi8(1);
653         //tmp_set_128i_2 = _mm_set_epi8 (128,7,128,6,128,5,128,4,128,3,128,2,128,1,128,0);
654         cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
655         //to avoid ffff to be saturated to 0 instead it should be to ff
656 
657         cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
658         band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
659         band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
660         cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31
661 
662         cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
663 
664         for(col = wd; col >= 16; col -= 16)
665         {
666             pu1_src_cpy = pu1_src;
667             for(row = ht; row > 0; row -= 2)
668             {
669                 //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
670                 src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
671                 // row = 1
672                 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
673 
674 
675                 //odd values
676                 src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
677                 src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
678                 //even values
679                 src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
680                 src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
681                 src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
682                 src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
683                 //combining odd values
684                 src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
685                 //combining even values
686                 src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
687 
688                 //saturated substract 8 bit
689                 tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
690                 tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
691                 //if the values less than 0 put ff
692                 tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
693                 tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
694                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
695                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
696                 //if the values greater than 31 put ff
697                 tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
698                 tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
699                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
700                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
701                 // registers reused to increase performance
702                 //if the values >16 then put ff ,cmp_mask = dup16(15) row 0 and row1
703                 src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
704                 //if the values >16 then put ff ,cmp_mask = dup16(15) row 2 and  row 3
705                 src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
706 
707                 //values 16 to 31 for row 0 & 1 but values <16 ==0
708                 tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
709                 // values 0 to 15 for row 0 & 1
710                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
711                 //values 16 to 31 for row 2 & 3 but values <16 ==0
712                 tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
713                 // values 0 to 15 for row 2 & 3
714                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
715 
716                 //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
717                 src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
718                 //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and  row 3
719                 src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
720                 tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
721                 tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
722 
723 
724                 //to choose which pixel values to preserve in row 0 and row 1
725                 src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
726                 //to choose which pixel values to preserve in row 2 and row 3
727                 src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
728                 //values of all rows to which no offset needs to be added preserved.
729                 src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
730                 src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
731 
732                 //indexing 0 - 15 bandtable indexes
733                 tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
734                 tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
735                 //indexing 16 -31 bandtable indexes
736                 tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
737                 tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
738                 // combining all offsets results
739                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
740                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
741                 // combing results with the pixel values
742                 src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
743                 src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
744                 //reorganising even and odd values
745                 src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
746                 src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
747 
748 
749                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
750                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
751                 // row = 1
752                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);
753 
754 
755                 pu1_src_cpy += (src_strd << 1);
756 
757             }
758             pu1_src += 16;
759         }
760 
761         wd_rem = wd & 0xF;
762         if(wd_rem)
763         {
764             pu1_src_cpy = pu1_src;
765             for(row = ht; row > 0; row -= 4)
766             {
767                 //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
768                 src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
769                 // row = 1
770                 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
771                 // row = 2
772                 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
773                 // row = 3
774                 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
775                 //row0 and row1 packed and row2 and row3 packed
776 
777                 src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
778                 src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
779                 //odd values
780                 src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
781                 src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
782                 //even values
783                 src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
784                 src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
785                 src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
786                 src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
787                 //combining odd values
788                 src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
789                 //combining even values
790                 src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);
791 
792                 //saturated substract 8 bit
793                 tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
794                 tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
795                 //if the values less than 0 put ff
796                 tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
797                 tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
798                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
799                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
800                 //if the values greater than 31 put ff
801                 tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
802                 tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
803                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
804                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
805                 // registers reused to increase performance
806                 //if the values >16 then put ff ,cmp_mask = dup16(15) row 0 and row1
807                 src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
808                 //if the values >16 then put ff ,cmp_mask = dup16(15) row 2 and  row 3
809                 src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
810 
811                 //values 16 to 31 for row 0 & 1 but values <16 ==0
812                 tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
813                 // values 0 to 15 for row 0 & 1
814                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
815                 //values 16 to 31 for row 2 & 3 but values <16 ==0
816                 tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
817                 // values 0 to 15 for row 2 & 3
818                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);
819 
820                 //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 0 and row1
821                 src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
822                 //values 16 to 31 for row 0 & 1 but values <16 masked to ff row 2 and  row 3
823                 src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
824                 tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
825                 tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);
826 
827 
828                 //to choose which pixel values to preserve in row 0 and row 1
829                 src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
830                 //to choose which pixel values to preserve in row 2 and row 3
831                 src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
832                 //values of all rows to which no offset needs to be added preserved.
833                 src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
834                 src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);
835 
836                 //indexing 0 - 15 bandtable indexes
837                 tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
838                 tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
839                 //indexing 16 -31 bandtable indexes
840                 tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
841                 tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
842                 // combining all offsets results
843                 tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
844                 tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
845                 // combing results with the pixel values
846                 src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
847                 src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
848                 //reorganising even and odd values
849                 src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
850                 src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
851                 //Getting row1 separately
852                 src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
853                 //Getting row3 separately
854                 src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
855 
856                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
857                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
858                 // row = 1
859                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
860                 // row = 2
861                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
862                 // row = 3
863                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);
864 
865                 pu1_src_cpy += (src_strd << 2);
866 
867             }
868             pu1_src += 16;
869         }
870 
871 
872     }
873 }
874 
875 
876 
877 void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
878                                         WORD32 src_strd,
879                                         UWORD8 *pu1_src_left,
880                                         UWORD8 *pu1_src_top,
881                                         UWORD8 *pu1_src_top_left,
882                                         UWORD8 *pu1_src_top_right,
883                                         UWORD8 *pu1_src_bot_left,
884                                         UWORD8 *pu1_avail,
885                                         WORD8 *pi1_sao_offset,
886                                         WORD32 wd,
887                                         WORD32 ht)
888 {
889     WORD32 row, col;
890     UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
891     UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
892     UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
893     UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
894     UWORD8 u1_avail0, u1_avail1;
895     WORD32 wd_rem;
896     WORD32 offset = 0;
897     __m128i src_temp0_16x8b, src_temp1_16x8b;
898     __m128i left0_16x8b, left1_16x8b;
899     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
900     __m128i edge0_16x8b, edge1_16x8b;
901     __m128i au1_mask8x16b;
902     __m128i edge_idx_8x16b, sao_offset_8x16b;
903     __m128i const2_16x8b, const0_16x8b;
904     __m128i left_store_16x8b;
905     UNUSED(pu1_src_top_right);
906     UNUSED(pu1_src_bot_left);
907 
908     au1_mask8x16b = _mm_set1_epi8(0xff);
909 
910     /* Update  top and top-left arrays */
911 
912     *pu1_src_top_left = pu1_src_top[wd - 1];
913 
914     for(col = wd; col >= 16; col -= 16)
915     {
916         const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
917         _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
918         offset += 16;
919     }
920 
921     //setting availability mask to ff size MAX_CTB_SIZE
922     for(col = 0; col < MAX_CTB_SIZE; col += 16)
923         _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
924     for(row = 0; row < ht; row++)
925     {
926         au1_src_left_tmp[row] = pu1_src_left[row];
927     }
928     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
929     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
930 
931     //availability mask creation
932     u1_avail0 = pu1_avail[0];
933     u1_avail1 = pu1_avail[1];
934     au1_mask[0] = u1_avail0;
935     au1_mask[wd - 1] = u1_avail1;
936 
937     const2_16x8b = _mm_set1_epi8(2);
938     const0_16x8b = _mm_setzero_si128();
939     pu1_src_left_cpy = au1_src_left_tmp;
940     pu1_src_left_str = au1_src_left_tmp1;
941     {
942         au1_mask_cpy = au1_mask;
943         for(col = wd; col >= 16; col -= 16)
944         {
945             pu1_src_cpy = pu1_src;
946             au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
947             //pu1_src_left_cpy =au1_src_left_tmp;
948             for(row = ht; row > 0; row -= 2)
949             {
950 
951                 left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
952                 //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
953                 src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
954                 // row = 1
955                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
956 
957                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
958                 //row 1 left
959                 left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
960                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
961                 //row 0 left
962                 left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
963                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
964 
965 
966                 //separating +ve and and -ve values.
967                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
968                 cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
969                 cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
970                 cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
971                 //creating mask 00 for +ve and -ve values and FF for zero.
972                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
973                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
974                 cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
975                 cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
976                 //combining the appropriate sign change
977                 left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
978                 left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
979 
980                 //row = 0 right
981                 edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
982                 // row = 1 right
983                 edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
984                 //separating +ve and and -ve values.
985                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
986                 cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
987                 cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
988                 cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
989                 //creating mask 00 for +ve and -ve values and FF for zero.
990                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
991                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
992                 cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
993                 cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
994                 //combining the appropriate sign change
995                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
996                 edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
997 
998                 //combining sign-left and sign_right
999                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1000                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1001                 //adding constant 2
1002                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1003                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1004                 //shuffle to get sao index
1005                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1006                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1007                 //using availability mask
1008                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1009                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1010 
1011                 //shuffle to get sao offset
1012                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1013                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1014                 //cnvert to 16 bit then add and then saturated pack
1015                 left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1016                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1017                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1018                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1019                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1020                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1021                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1022                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1023 
1024                 left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1025                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1026                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1027                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1028                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1029                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1030                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1031                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1032 
1033 
1034                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1035                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1036                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1037                 // row = 1
1038                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
1039 
1040                 pu1_src_cpy += (src_strd << 1);
1041                 pu1_src_left_cpy += 2;
1042                 pu1_src_left_str += 2;
1043             }
1044             au1_mask_cpy += 16;
1045             pu1_src += 16;
1046             pu1_src_left_cpy -= ht;
1047             pu1_src_left_str -= ht;
1048 
1049             pu1_left_tmp = pu1_src_left_cpy;
1050             pu1_src_left_cpy = pu1_src_left_str;
1051             pu1_src_left_str = pu1_left_tmp;
1052         }
1053 
1054         wd_rem = wd & 0xF;
1055         if(wd_rem)
1056         {
1057 
1058             cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
1059             _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);
1060 
1061             au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
1062             pu1_src_cpy = pu1_src;
1063             au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
1064             //pu1_src_left_cpy =au1_src_left_tmp;
1065             for(row = ht; row > 0; row -= 4)
1066             {
1067                 left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1068                 //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1069                 src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
1070                 // row = 1
1071                 cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1072                 // row  = 2
1073                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1074                 // row = 3
1075                 cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
1076 
1077 
1078                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
1079                 //row 3 left
1080                 edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
1081                 cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
1082                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1083                 //row 2 left
1084                 edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
1085                 left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
1086                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1087                 //row 1 left
1088                 edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
1089                 cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
1090                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1091                 //row 0 left
1092                 edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
1093                 left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
1094                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
1095 
1096                 // packing rows together for 16 SIMD operations
1097                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
1098                 src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
1099                 // packing rows together for 16 SIMD operations
1100                 left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
1101                 left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);
1102 
1103                 //separating +ve and and -ve values.
1104                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1105                 cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1106                 cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1107                 cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1108                 //creating mask 00 for +ve and -ve values and FF for zero.
1109                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1110                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1111                 cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
1112                 cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
1113                 //combining the appropriate sign change
1114                 left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1115                 left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
1116 
1117                 //row = 0 right
1118                 edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
1119                 // row = 1 right
1120                 cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
1121                 // row = 2 right
1122                 edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
1123                 // row = 3 right
1124                 cmp_gt1_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
1125                 // packing rows together for 16 SIMD operations
1126                 edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
1127                 edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);
1128 
1129                 //separating +ve and and -ve values.
1130                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1131                 cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1132                 cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1133                 cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1134                 //creating mask 00 for +ve and -ve values and FF for zero.
1135                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1136                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1137                 cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
1138                 cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
1139                 //combining the appropriate sign change
1140                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1141                 edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);
1142 
1143                 //combining sign-left and sign_right
1144                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1145                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1146                 //adding constant 2
1147                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1148                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1149                 //shuffle to get sao index
1150                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1151                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1152                 //shuffle to get sao offset
1153                 //using availability mask
1154                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1155                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1156 
1157                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1158                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1159                 //cnvert to 16 bit then add and then saturated pack
1160                 left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1161                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1162                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1163                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1164                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1165                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1166                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1167                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1168 
1169                 left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1170                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1171                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1172                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1173                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1174                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1175                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1176                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1177                 //separting row 1 and row 3
1178                 cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1179                 cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
1180 
1181                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1182                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1183                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1184                 // row = 1
1185                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
1186                 // row = 2
1187                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
1188                 // row = 3
1189                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);
1190 
1191                 pu1_src_cpy += (src_strd << 2);
1192                 pu1_src_left_cpy += 4;
1193                 pu1_src_left_str += 4;
1194             }
1195             pu1_src += wd;
1196             pu1_src_left_cpy -= ht;
1197             pu1_src_left_str -= ht;
1198 
1199             pu1_left_tmp = pu1_src_left_cpy;
1200             pu1_src_left_cpy = pu1_src_left_str;
1201             pu1_src_left_str = pu1_left_tmp;
1202         }
1203         for(row = 0; row < ht; row++)
1204         {
1205             pu1_src_left[row] = pu1_src_left_cpy[row];
1206         }
1207     }
1208 }
1209 
1210 
/**
*******************************************************************************
*
* @brief
*  SAO edge offset, class 0 (left/right neighbours), for interleaved chroma,
*  implemented with SSSE3 intrinsics. The block is filtered in place.
*
* @par Description:
*  Every byte is compared with the byte two positions to its left and the byte
*  two positions to its right (the data is byte-interleaved, so the same-plane
*  neighbour is two bytes away). The two comparison signs plus 2 form an index
*  into gi1_table_edge_idx; the result is masked with the availability mask
*  and then used to pick an offset. Even bytes pick from pi1_sao_offset_u and
*  odd bytes from pi1_sao_offset_v. The offset is added to the pixel with
*  unsigned saturation.
*
*  Before filtering, the function also refreshes the neighbour buffers:
*  pu1_src_top_left takes the last two bytes of pu1_src_top, pu1_src_top takes
*  the unfiltered last row of the block, and on return pu1_src_left holds the
*  unfiltered right-most byte pair of every row.
*
* @param[in,out] pu1_src
*  Top-left byte of the block to filter (modified in place)
*
* @param[in] src_strd
*  Distance in bytes between two consecutive rows of pu1_src
*
* @param[in,out] pu1_src_left
*  2 * ht bytes. In: the byte pair to the left of each row. Out: the
*  unfiltered right-most byte pair of each row.
*
* @param[out] pu1_src_top
*  Receives the unfiltered last row of the block
*
* @param[out] pu1_src_top_left
*  Two bytes; receives pu1_src_top[wd - 2] and pu1_src_top[wd - 1] as they were
*  on entry
*
* @param[in] pu1_src_top_right
*  Unused
*
* @param[in] pu1_src_bot_left
*  Unused
*
* @param[in] pu1_avail
*  pu1_avail[0] is used as the mask for the first byte pair of each row and
*  pu1_avail[1] for the last byte pair. The value is ANDed with the edge
*  index, so it is presumably either 0 or 0xFF - verify against the caller.
*
* @param[in] pi1_sao_offset_u
*  Offset table applied to even bytes (8 bytes are read)
*
* @param[in] pi1_sao_offset_v
*  Offset table applied to odd bytes (8 bytes are read)
*
* @param[in] wd
*  Width of the block in bytes. Full 16-byte columns are processed first; if
*  (wd & 0xF) is non-zero one further 8-byte column is processed.
*  NOTE(review): presumably wd is a multiple of 8 and at most MAX_CTB_SIZE -
*  confirm against the caller.
*
* @param[in] ht
*  Height of the block in rows. The 16-byte column loop consumes 2 rows per
*  iteration and the 8-byte column loop 4 rows per iteration.
*  NOTE(review): presumably ht is a multiple of 4 whenever the 8-byte column
*  exists - confirm against the caller.
*
* @returns
*  None
*
*******************************************************************************
*/
void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_src_left,
                                               UWORD8 *pu1_src_top,
                                               UWORD8 *pu1_src_top_left,
                                               UWORD8 *pu1_src_top_right,
                                               UWORD8 *pu1_src_bot_left,
                                               UWORD8 *pu1_avail,
                                               WORD8 *pi1_sao_offset_u,
                                               WORD8 *pi1_sao_offset_v,
                                               WORD32 wd,
                                               WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    /* Two scratch left-column buffers: one is read while the other is */
    /* written, and they are swapped after every column                */
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 u1_avail0, u1_avail1;
    WORD32 wd_rem;
    WORD32 offset = 0;

    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i left0_16x8b, left1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i left_store_16x8b;
    __m128i chroma_offset_8x16b;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    au1_mask8x16b = _mm_set1_epi8(0xff);

    /* Update  top and top-left arrays */
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];

    /* Save the unfiltered last row into pu1_src_top, 16 bytes at a time. */
    /* const0_16x8b is only a scratch register here; it is zeroed later.  */
    for(col = wd; col >= 16; col -= 16)
    {
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
        offset += 16;
    }
    /* Work on a private copy of the left column */
    for(row = 0; row < 2 * ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    //setting availability mask to ff size MAX_CTB_SIZE
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);

    edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    /* 0x0800 per 16-bit lane: adds 0 to even bytes and 8 to odd bytes, so */
    /* odd bytes index the upper half of sao_offset_8x16b                  */
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
    //availability mask creation
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[1] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;
    au1_mask[wd - 2] = u1_avail1;
    /* low 8 bytes = pi1_sao_offset_u table, high 8 bytes = pi1_sao_offset_v table */
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();

    {
        pu1_src_left_cpy = au1_src_left_tmp;
        pu1_src_left_str = au1_src_left_tmp1;
        au1_mask_cpy = au1_mask;
        /* Full 16-byte columns, two rows per iteration */
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);

            for(row = ht; row > 0; row -= 2)
            {

                /* bytes 0-1: left pair of row 0, bytes 2-3: left pair of row 1 */
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 0 load 16 bytes starting at the current position
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
                // row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

                /* Rotate so that the two left pairs sit in bytes 12-15 */
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
                //row 1 left: row 1 shifted by two bytes with its left pair shifted in
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
                /* push the unfiltered right-most pair of row 1 into left_store */
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
                //row 0 left
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
                /* left_store now starts with the right-most pairs of row 0 and row 1 */
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);


                //separating +ve and and -ve values.row 0 left
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change: result is sign(src - left)
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //separating +ve and and -ve values.row 1 left
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);


                //row = 0 right (neighbour is two bytes away)
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
                // row = 1 right
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
                //separating +ve and and -ve values.row 0 right
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //separating +ve and and -ve values.row 1 right
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit then add and then saturated pack
                /* left0 becomes the sign-extension byte of each signed offset */
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                /* Only the first 4 bytes are meaningful; the rest is overwritten */
                /* by the next iteration or falls in the buffer padding           */
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 16 filtered bytes
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            au1_mask_cpy += 16;
            pu1_src += 16;
            pu1_src_left_cpy -= 2 * ht;
            pu1_src_left_str -= 2 * ht;

            /* The pairs just saved become the left neighbours of the next column */
            pu1_left_tmp = pu1_src_left_cpy;
            pu1_src_left_cpy = pu1_src_left_str;
            pu1_src_left_str = pu1_left_tmp;
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {

            /* Save the unfiltered last row of the 8-byte column. Only the low */
            /* 8 bytes are stored, so only 8 bytes are loaded; a 16-byte load  */
            /* would read past the end of the column.                          */
            cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd));
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b);

            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
            pu1_src_cpy = pu1_src;
            /* Two 8-byte rows share a register, so duplicate the 8-byte mask */
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);

            /* 8-byte column, four rows per iteration: rows 0/1 and rows 2/3 */
            /* are packed into one 128-bit register each                     */
            for(row = ht; row > 0; row -= 4)
            {
                /* left pairs of rows 0..3 in bytes 0..7 */
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
                // row = 1
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row  = 2
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
                // row = 3
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));


                /* Move the four left pairs to bytes 8-15 */
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8);
                //row 3 left
                /* slli by 8 puts the row's 8 bytes in the upper half so that its */
                /* right-most pair is what alignr shifts into left_store          */
                edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8);
                left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
                //row 2 left
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);


                // packing rows together for 16 SIMD operations
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b);
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b);

                //row 1 left
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
                edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
                //row 0 left
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
                // packing rows together for 16 SIMD operations
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b);

                //separating +ve and and -ve values.for row 2 and row 3
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);





                //separating +ve and and -ve values.for row 0 and row 1
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);


                //row = 0 right
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2));
                // row = 1 right
                cmp_gt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2));
                // row = 2 right
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
                // row = 3 right
                cmp_lt0_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2));
                // packing rows together for 16 SIMD operations
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b);

                //separating +ve and and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit then add and then saturated pack
                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                left0_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                //separating row 1 and row 3 (upper halves of the packed registers)
                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);

                /* unfiltered right-most pairs of rows 0..3 */
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                // row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 8;
                pu1_src_left_str += 8;
            }
            pu1_src += wd;
            pu1_src_left_cpy -= 2 * ht;
            pu1_src_left_str -= 2 * ht;

            pu1_left_tmp = pu1_src_left_cpy;
            pu1_src_left_cpy = pu1_src_left_str;
            pu1_src_left_str = pu1_left_tmp;
        }
        /* After the final swap pu1_src_left_cpy points at the pairs saved */
        /* from the last processed column; hand them back to the caller    */
        for(row = 0; row < 2 * ht; row++)
        {
            pu1_src_left[row] = pu1_src_left_cpy[row];
        }
    }

}
1580 
1581 
ihevc_sao_edge_offset_class1_ssse3(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_src_left,UWORD8 * pu1_src_top,UWORD8 * pu1_src_top_left,UWORD8 * pu1_src_top_right,UWORD8 * pu1_src_bot_left,UWORD8 * pu1_avail,WORD8 * pi1_sao_offset,WORD32 wd,WORD32 ht)1582 void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src,
1583                                         WORD32 src_strd,
1584                                         UWORD8 *pu1_src_left,
1585                                         UWORD8 *pu1_src_top,
1586                                         UWORD8 *pu1_src_top_left,
1587                                         UWORD8 *pu1_src_top_right,
1588                                         UWORD8 *pu1_src_bot_left,
1589                                         UWORD8 *pu1_avail,
1590                                         WORD8 *pi1_sao_offset,
1591                                         WORD32 wd,
1592                                         WORD32 ht)
1593 {
1594     WORD32 row, col;
1595     UWORD8 *pu1_src_top_cpy;
1596     UWORD8 *pu1_src_cpy;
1597     WORD32 wd_rem;
1598 
1599 
1600     __m128i src_top_16x8b, src_bottom_16x8b;
1601     __m128i src_temp0_16x8b, src_temp1_16x8b;
1602     __m128i signup0_16x8b, signdwn1_16x8b;
1603     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
1604     __m128i edge0_16x8b, edge1_16x8b;
1605     __m128i edge_idx_8x16b, sao_offset_8x16b;
1606     __m128i const2_16x8b, const0_16x8b;
1607 
1608     UNUSED(pu1_src_top_right);
1609     UNUSED(pu1_src_bot_left);
1610 
1611 
1612     /* Updating left and top-left  */
1613     for(row = 0; row < ht; row++)
1614     {
1615         pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
1616     }
1617     *pu1_src_top_left = pu1_src_top[wd - 1];
1618 
1619 
1620 
1621     pu1_src_top_cpy = pu1_src_top;
1622     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
1623     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
1624 
1625     /* Update height and source pointers based on the availability flags */
1626     if(0 == pu1_avail[2])
1627     {
1628         pu1_src_top_cpy = pu1_src;
1629         pu1_src += src_strd;
1630         ht--;
1631     }
1632     if(0 == pu1_avail[3])
1633     {
1634         ht--;
1635     }
1636 
1637     const2_16x8b = _mm_set1_epi8(2);
1638     const0_16x8b = _mm_setzero_si128();
1639 
1640     {
1641         WORD32 ht_rem;
1642         for(col = wd; col >= 16; col -= 16)
1643         {
1644             pu1_src_cpy = pu1_src;
1645             src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
1646             //row = 0
1647             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
1648             //separating +ve and and -ve values.
1649             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
1650             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
1651             //creating mask 00 for +ve and -ve values and FF for zero.
1652             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1653             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1654             //combining the appropriate sign change
1655             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1656 
1657             for(row = ht; row >= 2; row -= 2)
1658             {
1659 
1660                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
1661                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
1662                 // row = 2
1663                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
1664 
1665 
1666                 //row 0 -row1
1667                 //separating +ve and and -ve values.
1668                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
1669                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
1670                 //creating mask 00 for +ve and -ve values and FF for zero.
1671                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1672                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1673                 //combining the appropriate sign change
1674                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1675                 //row1-row0
1676                 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
1677 
1678                 //row1 -bottom
1679                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
1680                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
1681                 //creating mask 00 for +ve and -ve values and FF for zero.
1682                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1683                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1684                 //combining the appropriate sign change
1685                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1686 
1687                 //combining sign-left and sign_right
1688                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
1689                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
1690 
1691                 //for the next iteration signup0_16x8b = -signdwn1_16x8b
1692                 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
1693                 //adding constant 2
1694                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1695                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1696                 //shuffle to get sao index
1697                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1698                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1699                 //shuffle to get sao offset
1700                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1701                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1702                 //copying the next top
1703                 src_top_16x8b = src_temp1_16x8b;
1704                 //cnvert to 16 bit then add and then saturated pack
1705                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1706                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1707                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
1708                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1709                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1710                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
1711                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1712                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1713 
1714                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1715                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1716                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
1717                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1718                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1719                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
1720                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1721                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1722 
1723                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1724                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1725                 // row = 1
1726                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
1727 
1728                 src_temp0_16x8b = src_bottom_16x8b;
1729                 pu1_src_cpy += (src_strd << 1);
1730             }
1731             ht_rem = ht & 0x1;
1732 
1733             if(ht_rem)
1734             {
1735                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
1736                 //current row -next row
1737                 //separating +ve and and -ve values.
1738                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
1739                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
1740                 //creating mask 00 for +ve and -ve values and FF for zero.
1741                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1742                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1743                 //combining the appropriate sign change
1744                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1745                 //adding top and botton and constant 2
1746                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
1747                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1748 
1749                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1750                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1751                 //copying the next top
1752                 src_top_16x8b = src_temp0_16x8b;
1753                 //cnvert to 16 bit then add and then saturated pack
1754                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1755                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1756                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
1757                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1758                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1759                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
1760                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1761                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1762 
1763                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1764             }
1765             if(0 == pu1_avail[3])
1766             {
1767                 src_top_16x8b = src_bottom_16x8b;
1768             }
1769             //updating top flag
1770             _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
1771             pu1_src += 16;
1772         }
1773 
1774         wd_rem = wd & 0xF;
1775         if(wd_rem)
1776         {
1777             pu1_src_cpy = pu1_src;
1778             src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
1779             //row = 0
1780             src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
1781             //separating +ve and and -ve values.
1782             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
1783             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
1784             //creating mask 00 for +ve and -ve values and FF for zero.
1785             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1786             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1787             //combining the appropriate sign change
1788             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1789             signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
1790             for(row = ht; row >= 4; row -= 4)
1791             {
1792                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
1793                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1794                 // row = 2
1795                 src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1796 
1797                 //row 0 -row1
1798                 //separating +ve and and -ve values.
1799                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
1800                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
1801                 //creating mask 00 for +ve and -ve values and FF for zero.
1802                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1803                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1804                 //combining the appropriate sign change
1805                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1806 
1807                 //row1-row0
1808                 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
1809                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
1810                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
1811                 //row1 -row2
1812                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
1813                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
1814                 //creating mask 00 for +ve and -ve values and FF for zero.
1815                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1816                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1817                 //combining the appropriate sign change
1818                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
1819                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
1820                 //packing row 0 n row 1
1821                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
1822                 //row = 3
1823                 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
1824                 // row = 4
1825                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
1826 
1827                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
1828                 signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //allign left (1-2)
1829                 //separating +ve and and -ve values.(2,3)
1830                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
1831                 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
1832                 //creating mask 00 for +ve and -ve values and FF for zero.
1833                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1834                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1835                 //combining the appropriate sign change
1836                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
1837 
1838                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3) ,(1-2) (substract with down)
1839                 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
1840                 //separating +ve and and -ve values.(3,4)
1841                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
1842                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
1843                 //creating mask 00 for +ve and -ve values and FF for zero.
1844                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1845                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1846                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
1847                 //combining sign-left and sign_right
1848                 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
1849 
1850                 edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
1851 
1852                 //packing row 2 n row 3
1853                 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
1854                 //for the next iteration signup0_16x8b = -signdwn1_16x8b
1855                 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
1856 
1857                 //adding constant 2
1858                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1859                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1860                 //shuffle to get sao index
1861                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1862                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1863                 //shuffle to get sao offset
1864                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1865                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1866                 //the next top already in  src_top_16x8b
1867                 //src_top_16x8b = src_temp1_16x8b;
1868                 //cnvert to 16 bit then add and then saturated pack
1869                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1870                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1871                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
1872                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1873                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1874                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
1875                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1876                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1877 
1878                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1879                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
1880                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
1881                 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
1882                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1883                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
1884                 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
1885                 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
1886 
1887                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1888                 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
1889                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1890                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1891                 // row = 1
1892                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
1893                 //row = 2
1894                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
1895                 // row = 3
1896                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
1897 
1898                 src_temp0_16x8b = src_temp1_16x8b;
1899                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
1900                 pu1_src_cpy += (src_strd << 2);
1901 
1902             }
1903             ht_rem = ht & 0x2;
1904             if(ht_rem)
1905             {
1906 
1907                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
1908                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1909                 // row = 2
1910                 src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1911 
1912                 //row 0 -row1
1913                 //separating +ve and and -ve values.
1914                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
1915                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
1916                 //creating mask 00 for +ve and -ve values and FF for zero.
1917                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1918                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1919                 //combining the appropriate sign change
1920                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1921                 //row1-row0
1922                 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
1923                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
1924                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
1925                 //row1 -row2
1926                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
1927                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
1928                 //creating mask 00 for +ve and -ve values and FF for zero.
1929                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1930                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1931                 //combining the appropriate sign change
1932                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
1933                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
1934                 //adding top and down substraction
1935                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
1936                 //for the next iteration signup0_16x8b = -signdwn1_16x8b
1937                 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
1938                 src_top_16x8b = src_temp1_16x8b;
1939                 //adding constant 2
1940                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1941 
1942                 //shuffle to get sao index
1943                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1944 
1945                 //shuffle to get sao offset
1946                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1947 
1948                 //the next top already in  src_top_16x8b
1949                 //cnvert to 16 bit then add and then saturated pack
1950                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1951                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1952                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
1953                 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1954                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1955                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
1956                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1957                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
1958 
1959                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1960 
1961                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1962                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1963                 // row = 1
1964                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
1965                 src_temp0_16x8b = src_bottom_16x8b;
1966                 pu1_src_cpy += (src_strd << 1);
1967 
1968             }
1969             ht_rem = ht & 0x1;
1970             if(ht_rem)
1971             {
1972 
1973                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
1974                 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1975 
1976                 //row 0 -row1
1977                 //separating +ve and and -ve values.
1978                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
1979                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
1980                 //creating mask 00 for +ve and -ve values and FF for zero.
1981                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1982                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1983                 //combining the appropriate sign change
1984                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1985                 //adding top and down substraction
1986                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
1987                 //adding constant 2
1988                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1989                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
1990                 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
1991                 //shuffle to get sao index
1992                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1993                 //shuffle to get sao offset
1994                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1995                 src_top_16x8b = src_temp0_16x8b;
1996                 //cnvert to 16 bit then add and then saturated pack
1997                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1998                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1999                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2000                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2001                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
2002                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2003                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2004                 pu1_src_cpy += (src_strd);
2005 
2006             }
2007             if(0 == pu1_avail[3])
2008             {
2009                 src_top_16x8b = src_bottom_16x8b;
2010             }
2011             _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2012             pu1_src += 8;
2013         }
2014     }
2015 }
2016 
2017 void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src,
2018                                                WORD32 src_strd,
2019                                                UWORD8 *pu1_src_left,
2020                                                UWORD8 *pu1_src_top,
2021                                                UWORD8 *pu1_src_top_left,
2022                                                UWORD8 *pu1_src_top_right,
2023                                                UWORD8 *pu1_src_bot_left,
2024                                                UWORD8 *pu1_avail,
2025                                                WORD8 *pi1_sao_offset_u,
2026                                                WORD8 *pi1_sao_offset_v,
2027                                                WORD32 wd,
2028                                                WORD32 ht)
2029 {
2030     WORD32 row, col;
2031     UWORD8 *pu1_src_top_cpy;
2032     UWORD8 *pu1_src_cpy;
2033     WORD32 wd_rem;
2034 
2035 
2036     __m128i src_top_16x8b, src_bottom_16x8b;
2037     __m128i src_temp0_16x8b, src_temp1_16x8b;
2038     __m128i signup0_16x8b, signdwn1_16x8b;
2039     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
2040     __m128i edge0_16x8b, edge1_16x8b;
2041     __m128i edge_idx_8x16b, sao_offset_8x16b;
2042     __m128i const2_16x8b, const0_16x8b;
2043     __m128i chroma_offset_8x16b;
2044 
2045     UNUSED(pu1_src_top_right);
2046     UNUSED(pu1_src_bot_left);
2047 
2048     /* Updating left and top and top-left */
2049     for(row = 0; row < ht; row++)
2050     {
2051         pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
2052         pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
2053     }
2054     pu1_src_top_left[0] = pu1_src_top[wd - 2];
2055     pu1_src_top_left[1] = pu1_src_top[wd - 1];
2056 
2057 
2058 
2059     pu1_src_top_cpy = pu1_src_top;
2060     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
2061     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
2062     const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
2063     chroma_offset_8x16b = _mm_set1_epi16(0x0800);
2064     /* Update height and source pointers based on the availability flags */
2065     if(0 == pu1_avail[2])
2066     {
2067         pu1_src_top_cpy = pu1_src;
2068         pu1_src += src_strd;
2069         ht--;
2070     }
2071     if(0 == pu1_avail[3])
2072     {
2073         ht--;
2074     }
2075     sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
2076     const2_16x8b = _mm_set1_epi8(2);
2077     const0_16x8b = _mm_setzero_si128();
2078 
2079 
2080     {
2081         WORD32 ht_rem;
2082 
2083 
2084 
2085         for(col = wd; col >= 16; col -= 16)
2086         {
2087             pu1_src_cpy = pu1_src;
2088             src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2089             //row = 0
2090             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
2091             //separating +ve and and -ve values.
2092             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2093             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2094             //creating mask 00 for +ve and -ve values and FF for zero.
2095             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2096             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2097             //combining the appropriate sign change
2098             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2099 
2100             for(row = ht; row >= 2; row -= 2)
2101             {
2102 
2103                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2104                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2105                 // row = 2
2106                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2107 
2108 
2109                 //row 0 -row1
2110                 //separating +ve and and -ve values.
2111                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
2112                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
2113                 //creating mask 00 for +ve and -ve values and FF for zero.
2114                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2115                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2116                 //combining the appropriate sign change
2117                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2118                 //row1-row0
2119                 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
2120 
2121                 //row1 -bottom
2122                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2123                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2124                 //creating mask 00 for +ve and -ve values and FF for zero.
2125                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2126                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2127                 //combining the appropriate sign change
2128                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2129 
2130                 //combining sign-left and sign_right
2131                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2132                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
2133 
2134                 //for the next iteration signup0_16x8b = -signdwn1_16x8b
2135                 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
2136                 //adding constant 2
2137                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2138                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2139                 //copying the next top
2140                 src_top_16x8b = src_temp1_16x8b;
2141 
2142 
2143                 //shuffle to get sao index
2144                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2145                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2146                 //adding chroma offset to access U and V
2147                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2148                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
2149 
2150                 //shuffle to get sao offset
2151                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2152                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
2153                 //cnvert to 16 bit then add and then saturated pack
2154                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2155                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2156                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2157                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2158                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2159                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2160                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
2161                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2162 
2163                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
2164                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
2165                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
2166                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
2167                 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
2168                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2169                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
2170                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
2171                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2172                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2173                 // row = 1
2174                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
2175 
2176                 src_temp0_16x8b = src_bottom_16x8b;
2177                 pu1_src_cpy += (src_strd << 1);
2178             }
2179             ht_rem = ht & 0x1;
2180 
2181             if(ht_rem)
2182             {
2183                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2184                 //current row -next row
2185                 //separating +ve and and -ve values.
2186                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2187                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2188                 //creating mask 00 for +ve and -ve values and FF for zero.
2189                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2190                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2191                 //combining the appropriate sign change
2192                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2193                 //adding top and botton and constant 2
2194                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2195                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2196                 //copying the next top
2197                 src_top_16x8b = src_temp0_16x8b;
2198 
2199                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2200                 //adding chroma offset to access U and V
2201                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2202                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2203 
2204                 //cnvert to 16 bit then add and then saturated pack
2205                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2206                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2207                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2208                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2209                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2210                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2211                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
2212                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2213 
2214                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2215             }
2216             if(0 == pu1_avail[3])
2217             {
2218                 src_top_16x8b = src_bottom_16x8b;
2219             }
2220             //updating top flag
2221             _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2222             pu1_src += 16;
2223         }
2224 
2225         wd_rem = wd & 0xF;
2226         if(wd_rem)
2227         {
2228             pu1_src_cpy = pu1_src;
2229             src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
2230             //row = 0
2231             src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
2232             //separating +ve and and -ve values.
2233             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2234             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2235             //creating mask 00 for +ve and -ve values and FF for zero.
2236             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2237             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2238             //combining the appropriate sign change
2239             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2240             signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
2241             for(row = ht; row >= 4; row -= 4)
2242             {
2243                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2244                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
2245                 // row = 2
2246                 src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
2247 
2248                 //row 0 -row1
2249                 //separating +ve and and -ve values.
2250                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
2251                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
2252                 //creating mask 00 for +ve and -ve values and FF for zero.
2253                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2254                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2255                 //combining the appropriate sign change
2256                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2257 
2258                 //row1-row0
2259                 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
2260                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
2261                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
2262                 //row1 -row2
2263                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2264                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2265                 //creating mask 00 for +ve and -ve values and FF for zero.
2266                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2267                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2268                 //combining the appropriate sign change
2269                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
2270                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
2271                 //packing row 0 n row 1
2272                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
2273                 //row = 3
2274                 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
2275                 // row = 4
2276                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
2277 
2278                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
2279                 signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //allign left (1-2)
2280                 //separating +ve and and -ve values.(2,3)
2281                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
2282                 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
2283                 //creating mask 00 for +ve and -ve values and FF for zero.
2284                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2285                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2286                 //combining the appropriate sign change
2287                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
2288 
2289                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3) ,(1-2) (substract with down)
2290                 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
2291                 //separating +ve and and -ve values.(3,4)
2292                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
2293                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
2294                 //creating mask 00 for +ve and -ve values and FF for zero.
2295                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2296                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2297                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
2298                 //combining sign-left and sign_right
2299                 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
2300 
2301                 edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
2302 
2303                 //packing row 2 n row 3
2304                 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
2305                 //for the next iteration signup0_16x8b = -signdwn1_16x8b
2306                 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
2307                 //adding constant 2
2308                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2309                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2310                 //shuffle to get sao index
2311                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2312                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2313                 //adding chroma offset to access U and V
2314                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2315                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
2316 
2317                 //shuffle to get sao offset
2318                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2319                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
2320                 //the next top already in  src_top_16x8b
2321                 //cnvert to 16 bit then add and then saturated pack
2322                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2323                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2324                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2325                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2326                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2327                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2328                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
2329                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2330 
2331                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
2332                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
2333                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
2334                 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
2335                 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
2336                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2337                 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
2338                 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
2339 
2340                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
2341                 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
2342                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2343                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2344                 // row = 1
2345                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
2346                 //row = 2
2347                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
2348                 // row = 3
2349                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
2350 
2351                 src_temp0_16x8b = src_temp1_16x8b;
2352                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
2353                 pu1_src_cpy += (src_strd << 2);
2354 
2355             }
2356             ht_rem = ht & 0x2;
2357             if(ht_rem)
2358             {
2359 
2360                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2361                 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
2362                 // row = 2
2363                 src_bottom_16x8b =  _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
2364 
2365                 //row 0 -row1
2366                 //separating +ve and and -ve values.
2367                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
2368                 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
2369                 //creating mask 00 for +ve and -ve values and FF for zero.
2370                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2371                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2372                 //combining the appropriate sign change
2373                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2374                 //row1-row0
2375                 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
2376                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
2377                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
2378                 //row1 -row2
2379                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2380                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2381                 //creating mask 00 for +ve and -ve values and FF for zero.
2382                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2383                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2384                 //combining the appropriate sign change
2385                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
2386                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
2387                 //adding top and down substraction
2388                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
2389                 //for the next iteration signup0_16x8b = -signdwn1_16x8b
2390                 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
2391                 src_top_16x8b = src_temp1_16x8b;
2392 
2393                 //adding constant 2
2394                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2395 
2396                 //shuffle to get sao index
2397                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2398 
2399                 //adding chroma offset to access U and V
2400                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2401                 //shuffle to get sao offset
2402                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2403                 //the next top already in  src_top_16x8b
2404                 //cnvert to 16 bit then add and then saturated pack
2405                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2406                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2407                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2408                 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
2409                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2410                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2411                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
2412                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
2413 
2414                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
2415 
2416                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2417                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2418                 // row = 1
2419                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
2420                 src_temp0_16x8b = src_bottom_16x8b;
2421                 pu1_src_cpy += (src_strd << 1);
2422 
2423             }
2424             ht_rem = ht & 0x1;
2425             if(ht_rem)
2426             {
2427 
2428                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2429                 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
2430 
2431                 //row 0 -row1
2432                 //separating +ve and and -ve values.
2433                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2434                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2435                 //creating mask 00 for +ve and -ve values and FF for zero.
2436                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2437                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2438                 //combining the appropriate sign change
2439                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2440                 //adding top and down substraction
2441                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2442                 //adding constant 2
2443                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2444                 src_top_16x8b = src_temp0_16x8b;
2445 
2446                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
2447                 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
2448                 //shuffle to get sao index
2449                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2450                 //adding chroma offset to access U and V
2451                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2452                 //shuffle to get sao offset
2453                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2454 
2455                 //cnvert to 16 bit then add and then saturated pack
2456                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2457                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2458                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2459                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2460                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
2461                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2462                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2463                 pu1_src_cpy += (src_strd);
2464 
2465             }
2466             if(0 == pu1_avail[3])
2467             {
2468                 src_top_16x8b = src_bottom_16x8b;
2469             }
2470             _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2471             pu1_src += 8;
2472         }
2473     }
2474 }
2475 
2476 /* 135 degree filtering */
ihevc_sao_edge_offset_class2_ssse3(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_src_left,UWORD8 * pu1_src_top,UWORD8 * pu1_src_top_left,UWORD8 * pu1_src_top_right,UWORD8 * pu1_src_bot_left,UWORD8 * pu1_avail,WORD8 * pi1_sao_offset,WORD32 wd,WORD32 ht)2477 void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src,
2478                                         WORD32 src_strd,
2479                                         UWORD8 *pu1_src_left,
2480                                         UWORD8 *pu1_src_top,
2481                                         UWORD8 *pu1_src_top_left,
2482                                         UWORD8 *pu1_src_top_right,
2483                                         UWORD8 *pu1_src_bot_left,
2484                                         UWORD8 *pu1_avail,
2485                                         WORD8 *pi1_sao_offset,
2486                                         WORD32 wd,
2487                                         WORD32 ht)
2488 {
2489     WORD32 row, col;
2490     UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
2491     UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
2492     UWORD8 *pu1_firstleft;
2493     UWORD8 *pu1_src_cpy, *pu1_src_org;
2494     UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
2495     UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
2496     UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
2497     WORD32 wd_rem;
2498     UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp;
2499     WORD32 ht_tmp, ht_0;
2500 
2501     WORD32 bit_depth;
2502     UWORD8 u1_avail0, u1_avail1;
2503 
2504     __m128i src_top_16x8b, src_bottom_16x8b;
2505     __m128i src_temp0_16x8b, src_temp1_16x8b;
2506     __m128i signup0_16x8b, signdwn1_16x8b;
2507     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
2508     __m128i edge0_16x8b, edge1_16x8b;
2509     __m128i au1_mask8x16b;
2510     __m128i edge_idx_8x16b, sao_offset_8x16b;
2511     __m128i const2_16x8b, const0_16x8b;
2512     __m128i left_store_16x8b;
2513     UNUSED(pu1_src_top_right);
2514     UNUSED(pu1_src_bot_left);
2515 
2516     ht_0 = ht; ht_tmp = ht;
2517     au1_mask8x16b = _mm_set1_epi8(0xff);
2518 
2519     //setting availability mask to ff size MAX_CTB_SIZE
2520     for(col = 0; col < MAX_CTB_SIZE; col += 16)
2521         _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
2522     for(row = 0; row < ht; row++)
2523     {
2524         au1_src_left_tmp[row] = pu1_src_left[row];
2525     }
2526     bit_depth = BIT_DEPTH_LUMA;
2527     pu1_src_org = pu1_src;
2528     pu1_src_top_cpy = pu1_src_top;
2529     pu1_src_left_cpy2 = au1_src_left_tmp;
2530     pu1_src_left_cpy = au1_src_left_tmp;
2531     pu1_src_left_str2 = au1_src_left_tmp1;
2532     pu1_src_left_str = au1_src_left_tmp1;
2533     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
2534     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
2535 
2536 
2537     /* If top-left is available, process separately */
2538     if(0 != pu1_avail[4])
2539     {
2540         WORD8 edge_idx;
2541 
2542         edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
2543                         SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
2544 
2545         edge_idx = gi1_table_edge_idx[edge_idx];
2546 
2547         if(0 != edge_idx)
2548         {
2549             u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2550         }
2551         else
2552         {
2553             u1_pos_0_0_tmp = pu1_src[0];
2554         }
2555     }
2556     else
2557     {
2558         u1_pos_0_0_tmp = pu1_src[0];
2559     }
2560 
2561     /* If bottom-right is available, process separately */
2562     if(0 != pu1_avail[7])
2563     {
2564         WORD8 edge_idx;
2565 
2566         edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
2567                         SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
2568 
2569         edge_idx = gi1_table_edge_idx[edge_idx];
2570 
2571         if(0 != edge_idx)
2572         {
2573             u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2574         }
2575         else
2576         {
2577             u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2578         }
2579     }
2580     else
2581     {
2582         u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2583     }
2584     pu1_firstleft = pu1_src_top_left;
2585 
2586     /* Update height and source pointers based on the availability flags */
2587     if(0 == pu1_avail[2])
2588     {
2589         pu1_firstleft = pu1_src_left_cpy2;
2590         pu1_src_left_cpy2++;
2591         pu1_src_left_str2++;
2592         pu1_src_top_cpy = pu1_src;
2593         pu1_src += src_strd;
2594         ht--;
2595     }
2596     if(0 == pu1_avail[3])
2597     {
2598         ht--;
2599         ht_0--;
2600     }
2601     //storing top left in a mmx register
2602     left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
2603     const2_16x8b = _mm_set1_epi8(2);
2604     const0_16x8b = _mm_setzero_si128();
2605     left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2606     //update top -left
2607     *pu1_src_top_left = pu1_src_top[wd - 1];
2608     //availability mask creation
2609     u1_avail0 = pu1_avail[0];
2610     u1_avail1 = pu1_avail[1];
2611     au1_mask[0] = u1_avail0;
2612     au1_mask[wd - 1] = u1_avail1;
2613     {
2614         WORD32 ht_rem;
2615 
2616 
2617         pu1_src_left_cpy = pu1_src_left_cpy2;
2618         pu1_src_left_str = pu1_src_left_str2;
2619         au1_mask_cpy = au1_mask;
2620         for(col = wd; col >= 16; col -= 16)
2621         {
2622             pu1_src_cpy = pu1_src;
2623             src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2624             //row = 0
2625             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
2626             src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
2627             //loading the mask
2628             au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
2629             //separating +ve and and -ve values.
2630             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2631             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2632             //creating mask 00 for +ve and -ve values and FF for zero.
2633             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2634             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2635             //combining the appropriate sign change
2636             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2637 
2638 
2639             for(row = ht; row >= 2; row -= 2)
2640             {
2641                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2642                 //row = 1
2643                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2644                 // row = 1 right
2645                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
2646                 //to insert left in row 0
2647                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2648                 //row 0 -row1
2649                 //separating +ve and and -ve values.
2650                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2651                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2652 
2653                 //creating mask 00 for +ve and -ve values and FF for zero.
2654                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2655                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2656                 //manipulation for row 1 - row 0
2657                 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
2658                 //combining the appropriate sign change
2659                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
2660                 //row1-row0
2661                 //separating +ve and and -ve values.
2662                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2663                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2664                 //creating mask 00 for +ve and -ve values and FF for zero.
2665                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2666                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2667                 // row = 2 right
2668                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
2669                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
2670 
2671 
2672                 //row1 -bottom
2673                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2674                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2675                 //creating mask 00 for +ve and -ve values and FF for zero.
2676                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2677                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2678                 //combining the appropriate sign change
2679                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2680                 // row = 2
2681                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2682 
2683                 //combining sign-left and sign_right
2684                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2685 
2686                 //storing the row 1 left for next row.
2687                 signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14);
2688 
2689                 //combining sign-left and sign_right
2690                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
2691                 //manipulation for bottom - row 1
2692                 signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15);
2693                 //eliminating old left for row 0 and row 1
2694                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
2695                 //bottom - row1
2696                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
2697                 cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
2698                 //creating mask 00 for +ve and -ve values and FF for zero.
2699                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2700                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2701                 //for the next iteration bottom -row1
2702                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2703                 //row1  getting it right for left of next block
2704                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
2705                 //adding constant 2
2706                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2707                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2708                 //shuffle to get sao index
2709                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2710                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2711                 //using availability mask
2712                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2713                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
2714                 //shuffle to get sao offset
2715                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2716                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
2717                 //row0  getting it right for left of next block
2718                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2719                 //copying the next top
2720                 src_top_16x8b = src_temp1_16x8b;
2721                 //cnvert to 16 bit then add and then saturated pack
2722                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2723                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2724                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2725                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2726                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2727                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2728                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2729                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2730 
2731                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
2732                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
2733                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
2734                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
2735                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2736                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
2737                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
2738                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
2739 
2740                 //store left boundary
2741                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
2742                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2743                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2744                 // row = 1
2745                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
2746 
2747                 src_temp0_16x8b = src_bottom_16x8b;
2748                 pu1_src_cpy += (src_strd << 1);
2749                 pu1_src_left_cpy += 2;
2750                 pu1_src_left_str += 2;
2751             }
2752             ht_rem = ht & 0x1;
2753 
2754             if(ht_rem)
2755             {
2756                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2757                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
2758                 //current row -next row
2759                 //separating +ve and and -ve values.
2760                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2761                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2762                 //creating mask 00 for +ve and -ve values and FF for zero.
2763                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2764                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2765                 //combining the appropriate sign change
2766                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2767                 //adding top and botton and constant 2
2768                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2769                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2770                 //eliminating old left for row 0 and row 1
2771                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
2772 
2773                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2774                 //using availability mask
2775                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2776 
2777                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2778 
2779                 //row0  getting it right for left of next block
2780                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2781                 //copying the next top
2782                 src_top_16x8b = src_temp0_16x8b;
2783                 //cnvert to 16 bit then add and then saturated pack
2784                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2785                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2786                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2787                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2788                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2789                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2790                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2791                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2792                 //store left boundary
2793                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
2794 
2795                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2796                 pu1_src_cpy += (src_strd);
2797                 pu1_src_left_cpy += 1;
2798                 pu1_src_left_str += 1;
2799             }
2800             if(0 == pu1_avail[3])
2801             {
2802                 src_top_16x8b = src_bottom_16x8b;
2803                 pu1_src_left_str[0] = pu1_src_cpy[15];
2804             }
2805             if(0 == pu1_avail[2])
2806             {
2807                 pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd];
2808             }
2809 
2810             //for the top left of next part of the block
2811             left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2812             //updating top flag
2813             _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2814             pu1_src += 16;
2815             au1_mask_cpy += 16;
2816 
2817 
2818             pu1_left_tmp = pu1_src_left_cpy2;
2819             pu1_src_left_cpy2 = pu1_src_left_str2;
2820             pu1_src_left_str2 = pu1_left_tmp;
2821 
2822             pu1_src_left_cpy = pu1_src_left_cpy2;
2823             pu1_src_left_str = pu1_src_left_str2;
2824         }
2825 
2826         wd_rem = wd & 0xF;
2827         if(wd_rem)
2828         {
2829             pu1_src_left_cpy = pu1_src_left_cpy2;
2830             pu1_src_left_str = pu1_src_left_str2;
2831             pu1_src_cpy = pu1_src;
2832             src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
2833             //row = 0
2834             src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
2835             src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
2836             au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
2837             //separating +ve and and -ve values.
2838             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2839             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2840             //creating mask 00 for +ve and -ve values and FF for zero.
2841             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2842             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2843             //preparing au1_mask
2844             au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
2845             //combining the appropriate sign change
2846             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2847             signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
2848 
2849             for(row = ht; row >= 4; row -= 4)
2850             {
2851                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2852                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2853                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2854                 // row = 2
2855                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2856                 //right row1
2857                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
2858                 //row 0 -row1
2859                 //separating +ve and and -ve values.
2860                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
2861                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
2862                 //manipulation for row 1 -row 0
2863                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
2864                 //creating mask 00 for +ve and -ve values and FF for zero.
2865                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2866                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2867                 //row 0 left
2868                 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
2869                 //combining the appropriate sign change
2870                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2871                 //row 1 -row0
2872                 //separating +ve and and -ve values.
2873                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2874                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2875 
2876                 //creating mask 00 for +ve and -ve values and FF for zero.
2877                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2878                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2879                 //row1-row0
2880                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2881 
2882                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
2883 
2884                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
2885                 //right row2
2886                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
2887                 //packing row 0 n row 1
2888                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
2889                 //row1 -row2
2890                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2891                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2892                 //creating mask 00 for +ve and -ve values and FF for zero.
2893                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2894                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2895                 //combining the appropriate sign change
2896                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
2897                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
2898                 //manipulation for row 2 -row 1
2899                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
2900                 //row 1 left
2901                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
2902                 //row = 3
2903                 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
2904 
2905                 // row = 4
2906                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
2907 
2908                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
2909 
2910                 //separating +ve and and -ve values.(2,1)
2911                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
2912                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
2913                 //manipulation for row 3 -row 2
2914                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
2915                 //creating mask 00 for +ve and -ve values and FF for zero.
2916                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2917                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2918                 //row 2 left
2919                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
2920                 //combining the appropriate sign change
2921                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
2922 
2923                 //separating +ve and and -ve values.(3,2)
2924                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
2925                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
2926                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
2927                 //creating mask 00 for +ve and -ve values and FF for zero.
2928                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2929                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2930                 //right row3
2931                 signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
2932                 //combining the appropriate sign change
2933                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
2934 
2935                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
2936 
2937                 //separating +ve and and -ve values.(2,3)
2938                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
2939                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
2940                 //right row 4
2941                 signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 1);
2942                 //creating mask 00 for +ve and -ve values and FF for zero.
2943                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2944                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2945                 //combining the appropriate sign change
2946                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
2947 
2948                 //separating +ve and and -ve values.(3,bottom)
2949                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
2950                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
2951 
2952                 //creating mask 00 for +ve and -ve values and FF for zero.
2953                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2954                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2955                 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
2956                 //combining the appropriate sign change
2957                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
2958                 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
2959 
2960                 //manipulation for bottom -row 3
2961                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
2962                 //eliminating old left for row 0,1,2,3
2963                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
2964                 //packing row 2 n row 3
2965                 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
2966                 //row 3 left
2967                 signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
2968                 //loading row 3 right into left
2969                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
2970                 //adding bottom and top values of row 2 and row 3
2971                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
2972                 //separating +ve and and -ve values.(botttom,3)
2973                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2974                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2975                 //to store right of row 2
2976                 signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
2977                 //creating mask 00 for +ve and -ve values and FF for zero.
2978                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2979                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2980                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
2981 
2982                 //storing right of row 2into left
2983                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
2984                 //to store right of row 0
2985                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
2986                 //storing right of row 1 into left
2987                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2988 
2989                 //adding constant 2
2990                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2991                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2992                 //shuffle to get sao index
2993                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2994                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2995                 //using availability mask
2996                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2997                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
2998                 //shuffle to get sao offset
2999                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3000                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3001 
3002                 //storing right of row 0 into left
3003                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3004                 //cnvert to 16 bit then add and then saturated pack
3005                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3006                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3007                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3008                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3009                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3010                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3011                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3012                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3013 
3014                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3015                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
3016                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3017                 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
3018                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3019                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3020                 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
3021                 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
3022 
3023                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3024                 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
3025 
3026                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3027                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3028                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3029                 // row = 1
3030                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3031                 //row = 2
3032                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
3033                 // row = 3
3034                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
3035 
3036                 src_temp0_16x8b = src_temp1_16x8b;
3037                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3038                 pu1_src_cpy += (src_strd << 2);
3039                 pu1_src_left_cpy += 4;
3040                 pu1_src_left_str += 4;
3041             }
3042             ht_rem = ht & 0x2;
3043             if(ht_rem)
3044             {
3045                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3046                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3047                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3048                 // row = 2
3049                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3050 
3051                 //row 0 -row 1
3052                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
3053                 //separating +ve and and -ve values.
3054                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3055                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3056                 //manipulation for row 1 -row 0
3057                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 15);
3058                 //creating mask 00 for +ve and -ve values and FF for zero.
3059                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3060                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3061                 //manipulation for row 1 - row 0
3062                 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
3063                 //combining the appropriate sign change
3064                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3065 
3066                 //row1-row0
3067                 //separating +ve and and -ve values.
3068                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3069                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3070 
3071                 //creating mask 00 for +ve and -ve values and FF for zero.
3072                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3073                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3074                 //combining the appropriate sign chang
3075                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3076                 //row 1 -bottom
3077                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
3078 
3079                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3080                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3081                 //row1 -bottom
3082                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3083                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3084 
3085                 //creating mask 00 for +ve and -ve values and FF for zero.
3086                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3087                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3088                 //combining the appropriate sign change
3089                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3090                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3091                 //manipulation for bottom -row1
3092                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3093                 //manipulation for bottom- row 1
3094                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
3095                 //adding top and down substraction
3096                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3097                 //bottom - row 1
3098                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3099                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3100 
3101                 //eliminating old left for row 0,1
3102                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3103                 signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
3104                 //creating mask 00 for +ve and -ve values and FF for zero.
3105                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3106                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3107                 //for the next iteration signup0_16x8b
3108                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
3109 
3110                 //storing right of row 1 into left
3111                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3112                 //for storing right of row 1
3113                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3114 
3115                 src_top_16x8b = src_temp1_16x8b;
3116                 //storing right of row 0 into left
3117                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3118 
3119                 //adding constant 2
3120                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3121 
3122                 //shuffle to get sao index
3123                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3124                 //using availability mask
3125                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3126                 //shuffle to get sao offset
3127                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3128 
3129                 //the next top already in  src_top_16x8b
3130                 //cnvert to 16 bit then add and then saturated pack
3131                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3132                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3133                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3134                 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3135                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3136                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3137                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
3138                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
3139 
3140                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3141 
3142                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3143                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3144                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3145                 // row = 1
3146                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3147                 src_temp0_16x8b = src_bottom_16x8b;
3148                 pu1_src_cpy += (src_strd << 1);
3149                 pu1_src_left_cpy += 2;
3150                 pu1_src_left_str += 2;
3151             }
3152             ht_rem = ht & 0x1;
3153             if(ht_rem)
3154             {
3155                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3156                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3157                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3158                 //left store manipulation 1
3159                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
3160                 //row 0 -row1
3161                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
3162                 //separating +ve and and -ve values.
3163                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3164                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3165                 //creating mask 00 for +ve and -ve values and FF for zero.
3166                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3167                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3168                 //combining the appropriate sign change
3169                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3170                 //adding top and down substraction
3171                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3172                 //for row 0 right to put into left store
3173                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3174                 //adding constant 2
3175                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3176                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
3177                 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
3178                 //filling the left boundary value
3179                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3180 
3181                 //shuffle to get sao index
3182                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3183                 //using availability mask
3184                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3185                 //shuffle to get sao offset
3186                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3187                 src_top_16x8b = src_temp0_16x8b;
3188                 //cnvert to 16 bit then add and then saturated pack
3189                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3190                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3191                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3192                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3193                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
3194 
3195                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3196                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3197                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3198                 pu1_src_cpy += (src_strd);
3199                 pu1_src_left_cpy += 1;
3200                 pu1_src_left_str += 1;
3201             }
3202             if(0 == pu1_avail[3])
3203             {
3204                 src_top_16x8b = src_bottom_16x8b;
3205                 pu1_src_left_str[0] = pu1_src_cpy[7];
3206             }
3207 
3208             if(0 == pu1_avail[2])
3209             {
3210                 pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd];
3211             }
3212 
3213             _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
3214             pu1_src += 8;
3215             au1_mask_cpy += 16;
3216 
3217             pu1_left_tmp = pu1_src_left_cpy2;
3218             pu1_src_left_cpy2 = pu1_src_left_str2;
3219             pu1_src_left_str2 = pu1_left_tmp;
3220 
3221             pu1_src_left_cpy = pu1_src_left_cpy2;
3222             pu1_src_left_str = pu1_src_left_str2;
3223         }
3224         pu1_src_org[0] = u1_pos_0_0_tmp;
3225         pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp;
3226         pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
3227         for(row = 0; row < ht_tmp; row++)
3228         {
3229             pu1_src_left[row] = pu1_src_left_cpy[row];
3230         }
3231     }
3232 
3233 }
3234 
3235 /* 135 degree filtering: SAO edge offset class 2 for interleaved chroma (U at even, V at odd byte positions) */
3236 void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src,
3237                                                WORD32 src_strd,
3238                                                UWORD8 *pu1_src_left,
3239                                                UWORD8 *pu1_src_top,
3240                                                UWORD8 *pu1_src_top_left,
3241                                                UWORD8 *pu1_src_top_right,
3242                                                UWORD8 *pu1_src_bot_left,
3243                                                UWORD8 *pu1_avail,
3244                                                WORD8 *pi1_sao_offset_u,
3245                                                WORD8 *pi1_sao_offset_v,
3246                                                WORD32 wd,
3247                                                WORD32 ht)
3248 {
3249     WORD32 row, col;
3250     UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
3251     UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
3252     UWORD8 *pu1_firstleft;
3253     UWORD8 *pu1_src_cpy, *pu1_src_org;
3254     UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
3255     UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
3256     UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
3257     WORD32 wd_rem;
3258     UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v;
3259     WORD32 ht_tmp;
3260     WORD32 ht_0;
3261 
3262     WORD32 bit_depth;
3263     UWORD8 u1_avail0, u1_avail1;
3264 
3265     __m128i src_temp0_16x8b, src_temp1_16x8b;
3266     __m128i signup0_16x8b, signdwn1_16x8b;
3267     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
3268     __m128i edge0_16x8b, edge1_16x8b;
3269     __m128i src_top_16x8b, src_bottom_16x8b;
3270     __m128i au1_mask8x16b;
3271     __m128i edge_idx_8x16b, sao_offset_8x16b;
3272     __m128i const2_16x8b, const0_16x8b;
3273     __m128i left_store_16x8b;
3274     __m128i chroma_offset_8x16b;
3275 
3276     UNUSED(pu1_src_top_right);
3277     UNUSED(pu1_src_bot_left);
3278 
3279     ht_0 = ht; ht_tmp = ht;
3280     au1_mask8x16b = _mm_set1_epi8(0xff);
3281     /* Updating left and top-left  */
3282     for(row = 0; row < 2 * ht; row++)
3283     {
3284         au1_src_left_tmp[row] = pu1_src_left[row];
3285     }
3286     //setting availability mask to ff size MAX_CTB_SIZE
3287     for(col = 0; col < MAX_CTB_SIZE; col += 16)
3288         _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
3289     bit_depth = BIT_DEPTH_LUMA;
3290     pu1_src_org = pu1_src;
3291     pu1_src_top_cpy = pu1_src_top;
3292     pu1_src_left_cpy2 = au1_src_left_tmp;
3293     pu1_src_left_cpy = au1_src_left_tmp;
3294     pu1_src_left_str2 = au1_src_left_tmp1;
3295     pu1_src_left_str = au1_src_left_tmp1;
3296     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
3297     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
3298     const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
3299     chroma_offset_8x16b = _mm_set1_epi16(0x0800);
3300 
3301     /* If top-left is available, process separately */
3302     if(0 != pu1_avail[4])
3303     {
3304         WORD32 edge_idx;
3305 
3306         /* U */
3307         edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
3308                         SIGN(pu1_src[0] - pu1_src[2 + src_strd]);
3309 
3310         edge_idx = gi1_table_edge_idx[edge_idx];
3311 
3312         if(0 != edge_idx)
3313         {
3314             u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
3315         }
3316         else
3317         {
3318             u1_pos_0_0_tmp_u = pu1_src[0];
3319         }
3320 
3321         /* V */
3322         edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
3323                         SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);
3324 
3325         edge_idx = gi1_table_edge_idx[edge_idx];
3326 
3327         if(0 != edge_idx)
3328         {
3329             u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
3330         }
3331         else
3332         {
3333             u1_pos_0_0_tmp_v = pu1_src[1];
3334         }
3335     }
3336     else
3337     {
3338         u1_pos_0_0_tmp_u = pu1_src[0];
3339         u1_pos_0_0_tmp_v = pu1_src[1];
3340     }
3341 
3342     /* If bottom-right is available, process separately */
3343     if(0 != pu1_avail[7])
3344     {
3345         WORD32 edge_idx;
3346 
3347         /* U */
3348         edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
3349                         SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);
3350 
3351         edge_idx = gi1_table_edge_idx[edge_idx];
3352 
3353         if(0 != edge_idx)
3354         {
3355             u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
3356         }
3357         else
3358         {
3359             u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
3360         }
3361 
3362         /* V */
3363         edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
3364                         SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);
3365 
3366         edge_idx = gi1_table_edge_idx[edge_idx];
3367 
3368         if(0 != edge_idx)
3369         {
3370             u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
3371         }
3372         else
3373         {
3374             u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
3375         }
3376     }
3377     else
3378     {
3379         u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
3380         u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
3381     }
3382     pu1_firstleft = pu1_src_top_left;
3383 
3384     /* Update height and source pointers based on the availability flags */
3385     if(0 == pu1_avail[2])
3386     {
3387         pu1_firstleft = pu1_src_left_cpy2;
3388         pu1_src_left_cpy2 += 2;
3389         pu1_src_left_str2 += 2;
3390         pu1_src_top_cpy = pu1_src;
3391         pu1_src += src_strd;
3392         ht--;
3393     }
3394     if(0 == pu1_avail[3])
3395     {
3396         ht--;
3397         ht_0--;
3398     }
3399     //storing top left in a mmx register
3400     left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
3401     sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
3402     const2_16x8b = _mm_set1_epi8(2);
3403     const0_16x8b = _mm_setzero_si128();
3404     left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3405 
3406     //availability mask creation
3407     u1_avail0 = pu1_avail[0];
3408     u1_avail1 = pu1_avail[1];
3409     au1_mask[0] = u1_avail0;
3410     au1_mask[1] = u1_avail0;
3411     au1_mask[wd - 1] = u1_avail1;
3412     au1_mask[wd - 2] = u1_avail1;
3413 
3414     /* top-left arrays */
3415     pu1_src_top_left[0] = pu1_src_top[wd - 2];
3416     pu1_src_top_left[1] = pu1_src_top[wd - 1];
3417     {
3418         WORD32 ht_rem;
3419         au1_mask_cpy = au1_mask;
3420 
3421         pu1_src_left_cpy = pu1_src_left_cpy2;
3422         pu1_src_left_str = pu1_src_left_str2;
3423         for(col = wd; col >= 16; col -= 16)
3424         {
3425             pu1_src_cpy = pu1_src;
3426             src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
3427             //row = 0
3428             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
3429             src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
3430             //loading the mask
3431             au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
3432             //separating +ve and and -ve values.
3433             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
3434             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
3435             //creating mask 00 for +ve and -ve values and FF for zero.
3436             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3437             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3438             //combining the appropriate sign change
3439             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3440 
3441 
3442             for(row = ht; row >= 2; row -= 2)
3443             {
3444                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3445                 //row = 1
3446                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3447                 // row = 1 right
3448                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
3449                 //to insert left in row 0
3450                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3451                 //row 0 -row1
3452                 //separating +ve and and -ve values.
3453                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
3454                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
3455 
3456                 //creating mask 00 for +ve and -ve values and FF for zero.
3457                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3458                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3459                 //manipulation for row 1 - row 0
3460                 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3461                 //combining the appropriate sign change
3462                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
3463                 //row1-row0
3464                 //separating +ve and and -ve values.
3465                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3466                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3467                 //creating mask 00 for +ve and -ve values and FF for zero.
3468                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3469                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3470                  // row = 2 right
3471                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
3472                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
3473 
3474 
3475                 //row1 -bottom
3476                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
3477                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
3478                 //creating mask 00 for +ve and -ve values and FF for zero.
3479                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3480                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3481                 //combining the appropriate sign change
3482                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3483                 // row = 2
3484                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3485 
3486                 //combining sign-left and sign_right
3487                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3488 
3489                 //storing the row 1 left for next row.
3490                 signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12);
3491 
3492                 //combining sign-left and sign_right
3493                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
3494                 //manipulation for bottom - row 1
3495                 signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14);
3496                 //eliminating old left for row 0 and row 1
3497                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
3498                 //bottom - row1
3499                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
3500                 cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
3501                 //creating mask 00 for +ve and -ve values and FF for zero.
3502                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3503                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3504                 //for the next iteration bottom -row1
3505                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3506                 //row1  getting it right for left of next iteration
3507                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
3508                 //copying the next top
3509                 src_top_16x8b = src_temp1_16x8b;
3510                 //row0  getting its right for left of next iteration.
3511                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3512 
3513 
3514                 //adding constant 2
3515                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3516                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
3517                 //shuffle to get sao index
3518                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3519                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
3520                 //using availability mask
3521                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3522                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
3523                 //adding chroma offset to access U and V
3524                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3525                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
3526 
3527 
3528                 //shuffle to get sao offset
3529                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3530                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit then add and then saturated pack
3532                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3533                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3534                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3535                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3536                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3537                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3538                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3539                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3540 
3541                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3542                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3543                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3544                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
3545                 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3546                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3547                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
3548                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
3549 
3550                 //store left boundary
3551                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3552                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3553                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3554                 // row = 1
3555                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
3556 
3557                 src_temp0_16x8b = src_bottom_16x8b;
3558                 pu1_src_cpy += (src_strd << 1);
3559                 pu1_src_left_cpy += 4;
3560                 pu1_src_left_str += 4;
3561             }
3562             ht_rem = ht & 0x1;
3563 
3564             if(ht_rem)
3565             {
3566                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3567                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
3568                 //current row -next row
3569                 //separating +ve and and -ve values.
3570                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
3571                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
3572                 //creating mask 00 for +ve and -ve values and FF for zero.
3573                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3574                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3575                 //combining the appropriate sign change
3576                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom and constant 2
3578                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3579                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3580 
                //eliminating old left for row 0
3582                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3583                 //copying the next top
3584                 src_top_16x8b = src_temp0_16x8b;
3585                 //row0  getting it right for left of next block
3586                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3587 
3588                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3589                 //using availability mask
3590                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3591                 //adding chroma offset to access U and V
3592                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3593 
3594                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3595 
                //convert to 16 bit then add and then saturated pack
3597                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3598                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3599                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3600                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3601                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3602                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3603                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3604                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3605 
3606                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3607 
3608                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3609                 pu1_src_cpy += (src_strd);
3610                 pu1_src_left_cpy += 2;
3611                 pu1_src_left_str += 2;
3612             }
3613             if(0 == pu1_avail[3])
3614             {
3615                 src_top_16x8b = src_bottom_16x8b;
3616                 pu1_src_left_str[1] = pu1_src_cpy[15];
3617                 pu1_src_left_str[0] = pu1_src_cpy[14];
3618             }
3619             if(0 == pu1_avail[2])
3620             {
3621                 pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd];
3622                 pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd];
3623             }
3624 
3625             //for the top left of next part of the block
3626             left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
3627             //updating top flag
3628             _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
3629             pu1_src += 16;
3630             au1_mask_cpy += 16;
3631 
3632             pu1_left_tmp = pu1_src_left_cpy2;
3633             pu1_src_left_cpy2 = pu1_src_left_str2;
3634             pu1_src_left_str2 = pu1_left_tmp;
3635 
3636             pu1_src_left_cpy = pu1_src_left_cpy2;
3637             pu1_src_left_str = pu1_src_left_str2;
3638         }
3639         wd_rem = wd & 0xF;
3640         if(wd_rem)
3641         {
3642             pu1_src_left_cpy = pu1_src_left_cpy2;
3643             pu1_src_left_str = pu1_src_left_str2;
3644             pu1_src_cpy = pu1_src;
3645             src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
3646             //row = 0
3647             src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
3648             src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
3649             au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
3650             //separating +ve and and -ve values.
3651             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
3652             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
3653             //creating mask 00 for +ve and -ve values and FF for zero.
3654             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3655             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3656             //preparing au1_mask
3657             au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
3658             //combining the appropriate sign change
3659             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3660             signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3661 
3662             for(row = ht; row >= 4; row -= 4)
3663             {
3664                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3665                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3666                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3667                 // row = 2
3668                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3669                 //right row1
3670                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3671                 //row 0 -row1
3672                 //separating +ve and and -ve values.
3673                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3674                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3675                 //manipulation for row 1 -row 0
3676                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3677                 //creating mask 00 for +ve and -ve values and FF for zero.
3678                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3679                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3680                 //row 0 left
3681                 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3682                 //combining the appropriate sign change
3683                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3684                 //row 1 -row0
3685                 //separating +ve and and -ve values.
3686                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3687                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3688 
3689                 //creating mask 00 for +ve and -ve values and FF for zero.
3690                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3691                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3692                 //row1-row0
3693                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3694 
3695                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3696 
3697                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3698                 //right row2
3699                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3700                 //packing row 0 n row 1
3701                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
3702                 //row1 -row2
3703                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3704                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3705                 //creating mask 00 for +ve and -ve values and FF for zero.
3706                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3707                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3708                 //combining the appropriate sign change
3709                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3710                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3711                 //manipulation for row 2 -row 1
3712                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
3713                 //row 1 left
3714                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
3715                 //row = 3
3716                 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
3717 
3718                 // row = 4
3719                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
3720 
3721                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3722 
3723                 //separating +ve and and -ve values.(2,1)
3724                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3725                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3726                 //manipulation for row 3 -row 2
3727                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
3728                 //creating mask 00 for +ve and -ve values and FF for zero.
3729                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3730                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3731                 //row 2 left
3732                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
3733                 //combining the appropriate sign change
3734                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
3735 
3736                 //separating +ve and and -ve values.(3,2)
3737                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
3738                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
3739                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
3740                 //creating mask 00 for +ve and -ve values and FF for zero.
3741                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3742                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3743                 //right row3
3744                 signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
3745                 //combining the appropriate sign change
3746                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
3747 
3748                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
3749 
3750                 //separating +ve and and -ve values.(2,3)
3751                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3752                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3753                 //right row 4
3754                 signdwn1_16x8b =  _mm_srli_si128(src_temp1_16x8b, 2);
3755                 //creating mask 00 for +ve and -ve values and FF for zero.
3756                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3757                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3758                 //combining the appropriate sign change
3759                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
3760 
3761                 //separating +ve and and -ve values.(3,bottom)
3762                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
3763                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
3764 
3765                 //creating mask 00 for +ve and -ve values and FF for zero.
3766                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3767                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3768                 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
3769                 //combining the appropriate sign change
3770                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
3771                 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
3772 
3773                 //manipulation for bottom -row 3
3774                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
3775                 //eliminating old left for row 0,1,2,3
3776                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
3777                 //packing row 2 n row 3
3778                 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
3779                 //row 3 left
3780                 signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
3781 
3782                 //adding bottom and top values of row 2 and row 3
3783                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
                //separating +ve and -ve values.(bottom,3)
3785                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3786                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3787 
3788                 //creating mask 00 for +ve and -ve values and FF for zero.
3789                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3790                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3791                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
3792 
3793                 //to store right of row 2
3794                 signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
3795                 //loading row 3 right into left
3796                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
                //storing right of row 2 into left
3798                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3799                 //to store right of row 0
3800                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3801                 //storing right of row 1 into left
3802                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3803                 //storing right of row 0 into left
3804                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3805 
3806                 //adding constant 2
3807                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3808                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
3809                 //shuffle to get sao index
3810                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3811                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
3812                 //using availability mask
3813                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3814                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
3815 
3816                 //adding chroma offset to access U and V
3817                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3818                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
3819 
3820                 //shuffle to get sao offset
3821                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3822                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit then add and then saturated pack
3824                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3825                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3826                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3827                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3828                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3829                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3830                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3831                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3832 
3833                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3834                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
3835                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3836                 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
3837                 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3838                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3839                 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
3840                 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
3841 
3842                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3843                 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
3844 
3845 
3846                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3847                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3848                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3849                 // row = 1
3850                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3851                 //row = 2
3852                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
3853                 // row = 3
3854                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
3855 
3856                 src_temp0_16x8b = src_temp1_16x8b;
3857                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3858                 pu1_src_cpy += (src_strd << 2);
3859                 pu1_src_left_cpy += 8;
3860                 pu1_src_left_str += 8;
3861             }
3862             ht_rem = ht & 0x2;
3863             if(ht_rem)
3864             {
3865                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3866                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3867                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3868                 // row = 2
3869                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3870 
3871                 //row 0 -row 1
3872                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3873                 //separating +ve and and -ve values.
3874                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3875                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3876                 //manipulation for row 1 -row 0
3877                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
3878                 //creating mask 00 for +ve and -ve values and FF for zero.
3879                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3880                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3881                 //manipulation for row 1 - row 0
3882                 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3883                 //combining the appropriate sign change
3884                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3885 
3886                 //row1-row0
3887                 //separating +ve and and -ve values.
3888                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3889                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3890 
3891                 //creating mask 00 for +ve and -ve values and FF for zero.
3892                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3893                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
3895                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3896                 //row 1 -bottom
3897                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3898 
3899                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3900                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3901                 //row1 -bottom
3902                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3903                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3904 
3905                 //creating mask 00 for +ve and -ve values and FF for zero.
3906                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3907                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3908                 //combining the appropriate sign change
3909                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3910                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3911                 //manipulation for bottom -row1
3912                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
3913                 //eliminating old left for row 0,1
3914                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
3915                 //manipulation for bottom- row 1
3916                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
                //adding top and down subtraction
3918                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3919                 //bottom - row 1
3920                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3921                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3922 
3923                 //shifting row 1
3924                 signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
3925                 //creating mask 00 for +ve and -ve values and FF for zero.
3926                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3927                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3928                 //for the next iteration signup0_16x8b
3929                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
3930                 //storing right of row 1 into left
3931                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); //for storing right of row 0
3932                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3933                 //the next top  in  src_top_16x8b
3934                 src_top_16x8b = src_temp1_16x8b;
3935                 //storing right of row 0 into left
3936                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3937 
3938 
3939                 //adding constant 2
3940                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3941 
3942                 //shuffle to get sao index
3943                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3944                 //using availability mask
3945                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3946 
3947                 //adding chroma offset to access U and V
3948                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3949 
3950                 //shuffle to get sao offset
3951                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3952                 //the next top already in  src_top_16x8b
3953                 //cnvert to 16 bit then add and then saturated pack
3954                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3955                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3956                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3957                 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3958                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3959                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3960                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
3961                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
3962 
3963                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3964 
3965                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3966                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3967                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3968                 // row = 1
3969                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3970                 src_temp0_16x8b = src_bottom_16x8b;
3971                 pu1_src_cpy += (src_strd << 1);
3972                 pu1_src_left_cpy += 4;
3973                 pu1_src_left_str += 4;
3974             }
3975             ht_rem = ht & 0x1;
3976             if(ht_rem)
3977             {
3978                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3979                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3980                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3981 
3982                 //row 0 -row1
3983                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3984                 //separating +ve and and -ve values.
3985                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3986                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3987                 //creating mask 00 for +ve and -ve values and FF for zero.
3988                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3989                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3990                 //combining the appropriate sign change
3991                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3992                 //adding top and down substraction
3993                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3994 
3995                 //for row 0 right to put into left store
3996                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3997                 //left store manipulation 1
3998                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3999                 src_top_16x8b = src_temp0_16x8b;
4000                 //filling the left boundary value
4001                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
4002 
4003                 //adding constant 2
4004                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4005                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
4006                 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
4007 
4008 
4009                 //shuffle to get sao index
4010                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4011                 //using availability mask
4012                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4013                 //adding chroma offset to access U and V
4014                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
4015 
4016                 //shuffle to get sao offset
4017                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4018 
4019                 //cnvert to 16 bit then add and then saturated pack
4020                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4021                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4022                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4023                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4024                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
4025 
4026                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4027                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4028                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4029                 pu1_src_cpy += (src_strd);
4030                 pu1_src_left_cpy += 2;
4031                 pu1_src_left_str += 2;
4032             }
4033             if(0 == pu1_avail[3])
4034             {
4035                 src_top_16x8b = src_bottom_16x8b;
4036                 pu1_src_left_str[1] = pu1_src_cpy[7];
4037                 pu1_src_left_str[0] = pu1_src_cpy[6];
4038             }
4039 
4040             if(0 == pu1_avail[2])
4041             {
4042                 pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd];
4043                 pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd];
4044             }
4045 
4046             _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4047             pu1_src += 8;
4048 
4049             pu1_left_tmp = pu1_src_left_cpy2;
4050             pu1_src_left_cpy2 = pu1_src_left_str2;
4051             pu1_src_left_str2 = pu1_left_tmp;
4052 
4053             pu1_src_left_cpy = pu1_src_left_cpy2;
4054             pu1_src_left_str = pu1_src_left_str2;
4055         }
4056         pu1_src_org[0] = u1_pos_0_0_tmp_u;
4057         pu1_src_org[1] = u1_pos_0_0_tmp_v;
4058         pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u;
4059         pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v;
4060         pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 2) : pu1_src_left_cpy;
4061         for(row = 0; row < 2 * ht_tmp; row++)
4062         {
4063             pu1_src_left[row] = pu1_src_left_cpy[row];
4064         }
4065     }
4066 
4067 }
4068 
ihevc_sao_edge_offset_class3_ssse3(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_src_left,UWORD8 * pu1_src_top,UWORD8 * pu1_src_top_left,UWORD8 * pu1_src_top_right,UWORD8 * pu1_src_bot_left,UWORD8 * pu1_avail,WORD8 * pi1_sao_offset,WORD32 wd,WORD32 ht)4069 void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src,
4070                                         WORD32 src_strd,
4071                                         UWORD8 *pu1_src_left,
4072                                         UWORD8 *pu1_src_top,
4073                                         UWORD8 *pu1_src_top_left,
4074                                         UWORD8 *pu1_src_top_right,
4075                                         UWORD8 *pu1_src_bot_left,
4076                                         UWORD8 *pu1_avail,
4077                                         WORD8 *pi1_sao_offset,
4078                                         WORD32 wd,
4079                                         WORD32 ht)
4080 {
4081     WORD32 row, col;
4082     UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
4083     UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
4084     UWORD8 *pu1_src_cpy, *pu1_src_org;
4085     UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
4086     UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
4087     UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
4088     WORD32 wd_rem;
4089     UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp;
4090     WORD32 ht_tmp;
4091     WORD32 bit_depth;
4092     UWORD8 u1_avail0, u1_avail1;
4093 
4094     __m128i src_top_16x8b, src_bottom_16x8b;
4095     __m128i src_temp0_16x8b, src_temp1_16x8b;
4096     __m128i signup0_16x8b, signdwn1_16x8b;
4097     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
4098     __m128i edge0_16x8b, edge1_16x8b;
4099     __m128i au1_mask8x16b;
4100     __m128i edge_idx_8x16b, sao_offset_8x16b;
4101     __m128i const2_16x8b, const0_16x8b;
4102     __m128i left_store_16x8b;
4103 
4104     ht_tmp = ht;
4105     au1_mask8x16b = _mm_set1_epi8(0xff);
4106 
4107     au1_src_left_tmp[0] = pu1_src[(wd - 1)];
4108     //manipulation for bottom left
4109     for(row = 1; row < ht; row++)
4110     {
4111         au1_src_left_tmp[row] = pu1_src_left[row];
4112     }
4113     au1_src_left_tmp[ht] = pu1_src_bot_left[0];
4114 
4115     *pu1_src_top_left = pu1_src_top[wd - 1];
4116     //setting availability mask to ff size MAX_CTB_SIZE
4117     for(col = 0; col < MAX_CTB_SIZE; col += 16)
4118         _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
4119     bit_depth = BIT_DEPTH_LUMA;
4120     pu1_src_org = pu1_src;
4121     pu1_src_top_cpy = pu1_src_top;
4122     pu1_src_left_cpy2 = au1_src_left_tmp;
4123     pu1_src_left_cpy = au1_src_left_tmp;
4124     pu1_src_left_str2 = au1_src_left_tmp1;
4125     pu1_src_left_str = au1_src_left_tmp1;
4126     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
4127     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
4128 
4129     /* If top-right is available, process separately */
4130     if(0 != pu1_avail[5])
4131     {
4132         WORD32 edge_idx;
4133 
4134         edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
4135                         SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);
4136 
4137         edge_idx = gi1_table_edge_idx[edge_idx];
4138 
4139         if(0 != edge_idx)
4140         {
4141             u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
4142         }
4143         else
4144         {
4145             u1_pos_wd_0_tmp = pu1_src[wd - 1];
4146         }
4147     }
4148     else
4149     {
4150         u1_pos_wd_0_tmp = pu1_src[wd - 1];
4151     }
4152 
4153     /* If bottom-left is available, process separately */
4154     if(0 != pu1_avail[6])
4155     {
4156         WORD32 edge_idx;
4157 
4158         edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
4159                         SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
4160 
4161         edge_idx = gi1_table_edge_idx[edge_idx];
4162 
4163         if(0 != edge_idx)
4164         {
4165             u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
4166         }
4167         else
4168         {
4169             u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
4170         }
4171     }
4172     else
4173     {
4174         u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
4175     }
4176 
4177 
4178 
4179     /* Update height and source pointers based on the availability flags */
4180     if(0 == pu1_avail[2])
4181     {
4182         pu1_src_left_cpy2++;
4183         pu1_src_left_str2++;
4184         pu1_src_top_cpy = pu1_src;
4185         pu1_src += src_strd;
4186         ht--;
4187     }
4188     if(0 == pu1_avail[3])
4189     {
4190         ht--;
4191     }
4192 
4193 
4194     const2_16x8b = _mm_set1_epi8(2);
4195     const0_16x8b = _mm_setzero_si128();
4196 
4197 
4198     //availability mask creation
4199     u1_avail0 = pu1_avail[0];
4200     u1_avail1 = pu1_avail[1];
4201     au1_mask[0] = u1_avail0;
4202     au1_mask[wd - 1] = u1_avail1;
4203     {
4204         WORD32 ht_rem;
4205 
4206         pu1_src_left_cpy = pu1_src_left_cpy2;
4207         pu1_src_left_str = pu1_src_left_str2;
4208         au1_mask_cpy = au1_mask;
4209         for(col = wd; col >= 16; col -= 16)
4210         {
4211             pu1_src_cpy = pu1_src;
4212             src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1));
4213             //row = 0
4214             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
4215 
4216             //loading the mask
4217             au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
4218             //separating +ve and and -ve values.
4219             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
4220             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
4221             //creating mask 00 for +ve and -ve values and FF for zero.
4222             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4223             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4224             //combining the appropriate sign change
4225             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4226 
4227             for(row = ht; row >= 2; row -= 2)
4228             {
4229                 left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
4230                 //row = 1
4231                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4232                 //to insert left in row 1
4233                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4234                 // row = 0 right
4235                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
4236 
4237                 //manipulation for row 1 - row 0
4238                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4239                 //row 0 -row1
4240                 //separating +ve and and -ve values.
4241                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4242                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4243 
4244                 //creating mask 00 for +ve and -ve values and FF for zero.
4245                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4246                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4247 
4248                 //combining the appropriate sign change
4249                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
4250                 //combining sign-left and sign_right
4251                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4252 
4253                 //row1-row0
4254                 //separating +ve and and -ve values.
4255                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
4256                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
4257                 //creating mask 00 for +ve and -ve values and FF for zero.
4258                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4259                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4260 
4261                 // row = 2
4262                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4263                 // row = 1 right
4264                 signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
4265                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
4266 
4267                 //bottom - row1
4268                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4269                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4270                 //creating mask 00 for +ve and -ve values and FF for zero.
4271                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4272                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4273                 //for the next iteration bottom -row1
4274                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4275 
4276                 //to insert left in row 1
4277                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
4278                 //manipulation for row 1 - bottom
4279                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4280 
4281                 //row1 -bottom
4282                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4283                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4284                 //creating mask 00 for +ve and -ve values and FF for zero.
4285                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4286                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4287                 //combining the appropriate sign change
4288                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4289 
4290                 //combining sign-left and sign_right
4291                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
4292 
4293                 //eliminating old left for row 0 and row 1
4294                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
4295 
4296                 //row1  getting it right for left of next block
4297                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
4298                 //adding constant 2
4299                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4300                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
4301                 //shuffle to get sao index
4302                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4303                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
4304                 //using availability mask
4305                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4306                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
4307                 //shuffle to get sao offset
4308                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4309                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
4310                 //row0  getting it right for left of next block
4311                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4312                 //copying the next top
4313                 src_top_16x8b = src_temp1_16x8b;
4314                 //cnvert to 16 bit then add and then saturated pack
4315                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4316                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4317                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4318                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4319                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4320                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4321                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4322                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4323 
4324                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
4325                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
4326                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
4327                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
4328                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4329                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
4330                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
4331                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
4332                 //store left boundary
4333                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4334                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4335                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4336                 // row = 1
4337                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
4338 
4339                 src_temp0_16x8b = src_bottom_16x8b;
4340                 pu1_src_cpy += (src_strd << 1);
4341                 pu1_src_left_cpy += 2;
4342                 pu1_src_left_str += 2;
4343             }
4344             ht_rem = ht & 0x1;
4345 
4346             if(ht_rem)
4347             {
4348                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4349                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4350                 //to insert left in row 1
4351                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4352                 //manipulation for row 1 - row 0
4353                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4354 
4355                 //current row -next row
4356                 //separating +ve and and -ve values.
4357                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4358                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4359                 //creating mask 00 for +ve and -ve values and FF for zero.
4360                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4361                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4362                 //combining the appropriate sign change
4363                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4364                 //adding top and bottom and constant 2
4365                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4366                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4367                 //eliminating old left for row 0 and row 1
4368                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4369 
4370                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4371                 //using availability mask
4372                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4373 
4374                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4375 
4376                 //row0  getting it right for left of next block
4377                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4378                 //copying the next top
4379                 src_top_16x8b = src_temp0_16x8b;
4380                 //cnvert to 16 bit then add and then saturated pack
4381                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4382                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4383                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4384                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4385                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4386                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4387                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4388                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4389                 //store left boundary
4390                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4391 
4392                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4393                 pu1_src_cpy += (src_strd);
4394                 src_temp0_16x8b = src_bottom_16x8b;
4395                 pu1_src_left_cpy++;
4396                 pu1_src_left_str++;
4397             }
4398             {   //for bottom right
4399                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4400                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4401                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4402                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4403             }
4404             if(0 == pu1_avail[3])
4405             {
4406                 src_top_16x8b = src_bottom_16x8b;
4407             }
4408             //for the top left of next part of the block
4409             left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
4410             //updating top flag
4411             _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4412             pu1_src += 16;
4413             au1_mask_cpy += 16;
4414 
4415             pu1_left_tmp = pu1_src_left_cpy2;
4416             pu1_src_left_cpy2 = pu1_src_left_str2;
4417             pu1_src_left_str2 = pu1_left_tmp;
4418 
4419             pu1_src_left_cpy = pu1_src_left_cpy2;
4420             pu1_src_left_str = pu1_src_left_str2;
4421         }
4422 
4423         wd_rem = wd & 0xF;
4424         if(wd_rem)
4425         {
4426             pu1_src_cpy = pu1_src;
4427             pu1_src_left_cpy = pu1_src_left_cpy2;
4428             pu1_src_left_str = pu1_src_left_str2;
4429             src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1));
4430             //row = 0
4431             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
4432             au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
4433             //separating +ve and and -ve values.
4434             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
4435             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
4436             //creating mask 00 for +ve and -ve values and FF for zero.
4437             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4438             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4439             //preparing au1_mask
4440             au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
4441             //combining the appropriate sign change
4442             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4443             signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
4444 
4445             for(row = ht; row >= 4; row -= 4)
4446             {
4447                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4448                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4449                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4450                 // row = 2
4451                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4452                 //manipulation for row 0 -row 1
4453                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4454                 //row 1 left
4455                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4456                 //row 0 -row1
4457                 //separating +ve and and -ve values.
4458                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4459                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4460 
4461                 //creating mask 00 for +ve and -ve values and FF for zero.
4462                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4463                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4464                 //manipulatiing for row 1 -row 0
4465                 signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
4466                 //combining the appropriate sign change
4467                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4468                 //row 1 -row0
4469                 //separating +ve and and -ve values.
4470                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4471                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4472 
4473                 //creating mask 00 for +ve and -ve values and FF for zero.
4474                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4475                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4476                 //row1-row0
4477                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4478 
4479                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
4480 
4481                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
4482                 //manipulation for row 1 -row 2
4483                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
4484                 //row 2 left
4485                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4486                 //packing row 0 n row 1
4487                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
4488                 //row1 -row2
4489                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4490                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4491                 //creating mask 00 for +ve and -ve values and FF for zero.
4492                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4493                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4494                 //combining the appropriate sign change
4495                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
4496                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
4497 
4498                 //row 1 right
4499                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
4500                 //row = 3
4501                 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
4502 
4503                 // row = 4
4504                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
4505 
4506                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
4507 
4508                 //separating +ve and and -ve values.(2,1)
4509                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4510                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4511 
4512                 //creating mask 00 for +ve and -ve values and FF for zero.
4513                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4514                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4515                 //row 2 right
4516                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
4517                 //combining the appropriate sign change
4518                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
4519 
4520                 //separating +ve and and -ve values.(3,2)
4521                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
4522                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
4523                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
4524                 //creating mask 00 for +ve and -ve values and FF for zero.
4525                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4526                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4527                 //manipulation for row 2 -row 3
4528                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
4529                 //row 3 left
4530                 signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
4531                 //combining the appropriate sign change
4532                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
4533 
4534                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
4535 
4536                 //separating +ve and and -ve values.(2,3)
4537                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4538                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4539 
4540                 //manipulation for row 3 -bottom
4541                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 11);
4542                 //bottom left
4543                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4544 
4545                 //creating mask 00 for +ve and -ve values and FF for zero.
4546                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4547                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4548                 //combining the appropriate sign change
4549                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
4550 
4551                 //separating +ve and and -ve values.(3,bottom)
4552                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
4553                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
4554 
4555                 //creating mask 00 for +ve and -ve values and FF for zero.
4556                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4557                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4558                 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
4559                 //combining the appropriate sign change
4560                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
4561                 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
4562 
4563 
4564                 //eliminating old left for row 0,1,2,3
4565                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
4566                 //packing row 2 n row 3
4567                 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
4568                 //row 3 right
4569                 signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
4570                 //loading row 3 right into left
4571                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
4572                 //adding bottom and top values of row 2 and row 3
4573                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
4574                 //separating +ve and and -ve values.(botttom,3)
4575                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4576                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4577                 //to store right of row 2
4578                 signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
4579                 //creating mask 00 for +ve and -ve values and FF for zero.
4580                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4581                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4582                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
4583 
4584                 //storing right of row 2into left
4585                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4586                 //to store right of row 0
4587                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4588                 //storing right of row 1 into left
4589                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4590 
4591                 //adding constant 2
4592                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4593                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
4594                 //shuffle to get sao index
4595                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4596                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
4597                 //using availability mask
4598                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4599                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
4600                 //shuffle to get sao offset
4601                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4602                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
4603 
4604                 //storing right of row 0 into left
4605                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4606                 //cnvert to 16 bit then add and then saturated pack
4607                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4608                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4609                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4610                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4611                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4612                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4613                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4614                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4615 
4616                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
4617                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
4618                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
4619                 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
4620                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4621                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
4622                 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
4623                 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
4624 
4625                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
4626                 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
4627 
4628                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4629                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4630                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4631                 // row = 1
4632                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
4633                 //row = 2
4634                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
4635                 // row = 3
4636                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
4637 
4638                 src_temp0_16x8b = src_temp1_16x8b;
4639                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
4640                 pu1_src_cpy += (src_strd << 2);
4641                 pu1_src_left_cpy += 4;
4642                 pu1_src_left_str += 4;
4643             }
4644             ht_rem = ht & 0x2;
4645             if(ht_rem)
4646             {
4647                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4648                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4649                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4650                 // row = 2
4651                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4652 
4653                 //manipulation for row 0 -row 1
4654                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4655                 //bottom left
4656                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4657                 //separating +ve and and -ve values.
4658                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4659                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4660 
4661                 //creating mask 00 for +ve and -ve values and FF for zero.
4662                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4663                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4664                 //manipulation for row 1 - row 0
4665                 signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
4666                 //combining the appropriate sign change
4667                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4668 
4669                 //row1-row0
4670                 //separating +ve and and -ve values.
4671                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4672                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4673 
4674                 //creating mask 00 for +ve and -ve values and FF for zero.
4675                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4676                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4677                 //combining the appropriate sign chang
4678                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4679 
4680                 //manipulation for row 1 -bottom
4681                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 13);
4682                 //bottom left
4683                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4684 
4685                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
4686                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
4687                 //row1 -bottom
4688                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4689                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4690 
4691                 //creating mask 00 for +ve and -ve values and FF for zero.
4692                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4693                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4694                 //combining the appropriate sign change
4695                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
4696                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
4697                 //manipulation for bottom- row 1 (row 1 right)
4698                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
4699                 //adding top and down substraction
4700                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
4701                 //bottom - row 1
4702                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4703                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4704 
4705                 //eliminating old left for row 0,1
4706                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
4707                 signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
4708                 //creating mask 00 for +ve and -ve values and FF for zero.
4709                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4710                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4711                 //for the next iteration signup0_16x8b
4712                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
4713 
4714                 //storing right of row 1 into left
4715                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4716                 //for storing right of row 1
4717                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4718 
4719                 src_top_16x8b = src_temp1_16x8b;
4720                 //storing right of row 0 into left
4721                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4722 
4723                 //adding constant 2
4724                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4725 
4726                 //shuffle to get sao index
4727                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4728                 //using availability mask
4729                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4730                 //shuffle to get sao offset
4731                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4732 
4733                 //the next top already in  src_top_16x8b
4734                 //cnvert to 16 bit then add and then saturated pack
4735                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4736                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4737                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4738                 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
4739                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4740                 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4741                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
4742                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
4743 
4744                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
4745 
4746                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4747                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4748                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4749                 // row = 1
4750                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
4751                 src_temp0_16x8b = src_bottom_16x8b;
4752                 pu1_src_cpy += (src_strd << 1);
4753                 pu1_src_left_cpy += 2;
4754                 pu1_src_left_str += 2;
4755             }
4756             ht_rem = ht & 0x1;
4757             if(ht_rem)
4758             {
4759                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4760                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4761                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4762 
4763 
4764                 //manipulation for row 0 -bottom
4765                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 14);
4766                 //bottom left
4767                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4768                 //separating +ve and and -ve values.
4769                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4770                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4771                 //creating mask 00 for +ve and -ve values and FF for zero.
4772                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4773                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4774                 //combining the appropriate sign change
4775                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4776                 //adding top and down substraction
4777                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4778                 //for row 0 right to put into left store
4779                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4780                 //adding constant 2
4781                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4782                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
4783                 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
4784                 //left store manipulation 1
4785                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4786                 //filling the left boundary value
4787                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4788 
4789                 //shuffle to get sao index
4790                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4791                 //using availability mask
4792                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4793                 //shuffle to get sao offset
4794                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4795                 src_top_16x8b = src_temp0_16x8b;
4796                 //cnvert to 16 bit then add and then saturated pack
4797                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4798                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4799                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4800                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4801                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
4802 
4803                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4804                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4805                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4806                 pu1_src_cpy += (src_strd);
4807                 src_temp0_16x8b = src_bottom_16x8b;
4808                 pu1_src_left_cpy++;
4809                 pu1_src_left_str++;
4810             }
4811             {   //for bottom right
4812                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4813                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4814                 src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4815                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4816                 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4817             }
4818             if(0 == pu1_avail[3])
4819             {
4820                 src_top_16x8b = src_bottom_16x8b;
4821             }
4822             _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4823             pu1_src += 8;
4824 
4825             pu1_left_tmp = pu1_src_left_cpy2;
4826             pu1_src_left_cpy2 = pu1_src_left_str2;
4827             pu1_src_left_str2 = pu1_left_tmp;
4828 
4829             pu1_src_left_cpy = pu1_src_left_cpy2;
4830             pu1_src_left_str = pu1_src_left_str2;
4831 
4832         }
4833         pu1_src_org[wd - 1] = u1_pos_wd_0_tmp;
4834         pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp;
4835         pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
4836         pu1_src_left[0] = au1_src_left_tmp[0];
4837         for(row = 1; row < ht_tmp; row++)
4838         {
4839             pu1_src_left[row] = pu1_src_left_cpy[row];
4840         }
4841     }
4842 
4843 }
4844 
4845 void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src,
4846                                                WORD32 src_strd,
4847                                                UWORD8 *pu1_src_left,
4848                                                UWORD8 *pu1_src_top,
4849                                                UWORD8 *pu1_src_top_left,
4850                                                UWORD8 *pu1_src_top_right,
4851                                                UWORD8 *pu1_src_bot_left,
4852                                                UWORD8 *pu1_avail,
4853                                                WORD8 *pi1_sao_offset_u,
4854                                                WORD8 *pi1_sao_offset_v,
4855                                                WORD32 wd,
4856                                                WORD32 ht)
4857 {
4858     WORD32 row, col;
4859     UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
4860     UWORD8 *pu1_src_cpy, *pu1_src_org;
4861     UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
4862     UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
4863     WORD32 wd_rem;
4864     UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v;
4865     WORD32 ht_tmp;
4866     WORD32 bit_depth;
4867     UWORD8 u1_avail0, u1_avail1;
4868 
4869     __m128i src_top_16x8b, src_bottom_16x8b;
4870     __m128i src_temp0_16x8b, src_temp1_16x8b;
4871     __m128i signup0_16x8b, signdwn1_16x8b;
4872     __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
4873     __m128i edge0_16x8b, edge1_16x8b;
4874     __m128i au1_mask8x16b;
4875     __m128i edge_idx_8x16b, sao_offset_8x16b;
4876     __m128i left_store_16x8b;
4877     __m128i const0_16x8b, const2_16x8b;
4878     __m128i chroma_offset_8x16b;
4879 
4880     ht_tmp = ht;
4881     au1_mask8x16b = _mm_set1_epi8(0xff);
4882 
4883 
4884     au1_src_left_tmp[0] = pu1_src[(wd - 2)];
4885     au1_src_left_tmp[1] = pu1_src[(wd - 1)];
4886     //manipulation for bottom left
4887     for(row = 2; row < 2 * ht; row++)
4888     {
4889         au1_src_left_tmp[row] = pu1_src_left[row];
4890     }
4891     au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0];
4892     au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1];
4893 
4894     pu1_src_top_left[0] = pu1_src_top[wd - 2];
4895     pu1_src_top_left[1] = pu1_src_top[wd - 1];
4896     //setting availability mask to ff size MAX_CTB_SIZE
4897     for(col = 0; col < MAX_CTB_SIZE; col += 16)
4898         _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
4899     bit_depth = BIT_DEPTH_LUMA;
4900     pu1_src_org = pu1_src;
4901     pu1_src_top_cpy = pu1_src_top;
4902     pu1_src_left_cpy2 = au1_src_left_tmp;
4903     pu1_src_left_cpy = au1_src_left_tmp;
4904     edge_idx_8x16b   = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
4905     sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
4906     const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
4907     chroma_offset_8x16b = _mm_set1_epi16(0x0800);
4908     /* If top-right is available, process separately */
4909     if(0 != pu1_avail[5])
4910     {
4911         WORD32 edge_idx;
4912 
4913         /* U */
4914         edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
4915                         SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
4916 
4917         edge_idx = gi1_table_edge_idx[edge_idx];
4918 
4919         if(0 != edge_idx)
4920         {
4921             u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
4922         }
4923         else
4924         {
4925             u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
4926         }
4927 
4928         /* V */
4929         edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
4930                         SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);
4931 
4932         edge_idx = gi1_table_edge_idx[edge_idx];
4933 
4934         if(0 != edge_idx)
4935         {
4936             u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
4937         }
4938         else
4939         {
4940             u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
4941         }
4942     }
4943     else
4944     {
4945         u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
4946         u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
4947     }
4948 
4949     /* If bottom-left is available, process separately */
4950     if(0 != pu1_avail[6])
4951     {
4952         WORD32 edge_idx;
4953 
4954         /* U */
4955         edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
4956                         SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
4957 
4958         edge_idx = gi1_table_edge_idx[edge_idx];
4959 
4960         if(0 != edge_idx)
4961         {
4962             u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
4963         }
4964         else
4965         {
4966             u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
4967         }
4968 
4969         /* V */
4970         edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
4971                         SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);
4972 
4973         edge_idx = gi1_table_edge_idx[edge_idx];
4974 
4975         if(0 != edge_idx)
4976         {
4977             u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
4978         }
4979         else
4980         {
4981             u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
4982         }
4983     }
4984     else
4985     {
4986         u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
4987         u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
4988     }
4989 
4990 
4991 
4992     /* Update height and source pointers based on the availability flags */
4993     if(0 == pu1_avail[2])
4994     {
4995         pu1_src_left_cpy2 += 2;
4996         pu1_src_top_cpy = pu1_src;
4997         pu1_src += src_strd;
4998         ht--;
4999     }
5000     if(0 == pu1_avail[3])
5001     {
5002         ht--;
5003     }
5004 
5005     sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
5006     const2_16x8b = _mm_set1_epi8(2);
5007     const0_16x8b = _mm_setzero_si128();
5008 
5009 
5010     //availability mask creation
5011     u1_avail0 = pu1_avail[0];
5012     u1_avail1 = pu1_avail[1];
5013     au1_mask[0] = u1_avail0;
5014     au1_mask[1] = u1_avail0;
5015     au1_mask[wd - 1] = u1_avail1;
5016     au1_mask[wd - 2] = u1_avail1;
5017     {
5018         WORD32 ht_rem;
5019         au1_mask_cpy = au1_mask;
5020         for(col = wd; col >= 16; col -= 16)
5021         {
5022             pu1_src_cpy = pu1_src;
5023             src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2));
5024             //row = 0
5025             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
5026 
5027             //loading the mask
5028             au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
5029             //separating +ve and and -ve values.
5030             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
5031             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
5032             //creating mask 00 for +ve and -ve values and FF for zero.
5033             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5034             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5035             //combining the appropriate sign change
5036             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5037             pu1_src_left_cpy = pu1_src_left_cpy2;
5038 
5039             for(row = ht; row >= 2; row -= 2)
5040             {
5041                 left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
5042                 //row = 1
5043                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5044                 //to insert left in row 1
5045                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
5046                 // row = 0 right
5047                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
5048 
5049                 //manipulation for row 1 - row 0
5050                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5051                 //row 0 -row1
5052                 //separating +ve and and -ve values.
5053                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5054                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5055 
5056                 //creating mask 00 for +ve and -ve values and FF for zero.
5057                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5058                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5059 
5060                 //combining the appropriate sign change
5061                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
5062                 //combining sign-left and sign_right
5063                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5064 
5065                 //row1-row0
5066                 //separating +ve and and -ve values.
5067                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
5068                 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
5069                 //creating mask 00 for +ve and -ve values and FF for zero.
5070                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5071                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5072 
5073                 // row = 2
5074                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5075                 // row = 1 right
5076                 signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
5077                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
5078 
5079                 //bottom - row1
5080                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5081                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5082                 //creating mask 00 for +ve and -ve values and FF for zero.
5083                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5084                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5085                 //for the next iteration bottom -row1
5086                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5087 
5088                 //to insert left in row 1
5089                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
5090                 //manipulation for row 1 - bottom
5091                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5092 
5093                 //row1 -bottom
5094                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5095                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5096                 //creating mask 00 for +ve and -ve values and FF for zero.
5097                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5098                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5099                 //combining the appropriate sign change
5100                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5101 
5102                 //combining sign-left and sign_right
5103                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
5104 
5105                 //eliminating old left for row 0 and row 1
5106                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
5107                 //row1  getting it right for left of next block
5108                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
5109                 //row0  getting it right for left of next block
5110                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5111                 //copying the next top
5112                 src_top_16x8b = src_temp1_16x8b;
5113 
5114 
5115                 //adding constant 2
5116                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5117                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
5118                 //shuffle to get sao index
5119                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5120                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
5121                 //using availability mask
5122                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5123                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
5124 
5125                 //adding chroma offset to access U and V
5126                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5127                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
5128 
5129                 //shuffle to get sao offset
5130                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5131                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
5132                 //cnvert to 16 bit then add and then saturated pack
5133                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5134                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5135                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5136                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5137                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5138                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5139                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5140                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5141 
5142                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
5143                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
5144                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
5145                 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
5146                 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
5147                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5148                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
5149                 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
5150                 //store left boundary
5151                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5152                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5153                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5154                 // row = 1
5155                 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
5156 
5157                 src_temp0_16x8b = src_bottom_16x8b;
5158                 pu1_src_cpy += (src_strd << 1);
5159                 pu1_src_left_cpy += 4;
5160             }
5161             ht_rem = ht & 0x1;
5162 
5163             if(ht_rem)
5164             {
5165                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5166                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5167                 //to insert left in row 1
5168                 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
5169                 //manipulation for row 1 - row 0
5170                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5171 
5172                 //current row -next row
5173                 //separating +ve and and -ve values.
5174                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5175                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5176                 //creating mask 00 for +ve and -ve values and FF for zero.
5177                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5178                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5179                 //combining the appropriate sign change
5180                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5181                 //adding top and bottom and constant 2
5182                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5183                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5184                 //eliminating old left for row 0 and row 1
5185                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5186                 //row0  getting it right for left of next block
5187                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5188                 //copying the next top
5189                 src_top_16x8b = src_temp0_16x8b;
5190 
5191                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5192                 //using availability mask
5193                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5194 
5195                 //adding chroma offset to access U and V
5196                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5197 
5198 
5199                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5200 
5201                 //cnvert to 16 bit then add and then saturated pack
5202                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5203                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5204                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5205                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5206                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5207                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5208                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5209                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5210 
5211                 //store left boundary
5212                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5213 
5214                 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5215                 pu1_src_cpy += (src_strd);
5216                 src_temp0_16x8b = src_bottom_16x8b;
5217                 pu1_src_left_cpy += 2;
5218             }
5219             {   //for bottom right
5220                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5221                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5222                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5223                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5224             }
5225             if(0 == pu1_avail[3])
5226             {
5227                 src_top_16x8b = src_bottom_16x8b;
5228             }
5229             //for the top left of next part of the block
5230             left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
5231             //updating top flag
5232             _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
5233             pu1_src += 16;
5234             au1_mask_cpy += 16;
5235         }
5236         pu1_src_left_cpy = pu1_src_left_cpy2;
5237         wd_rem = wd & 0xF;
5238         if(wd_rem)
5239         {
5240             pu1_src_cpy = pu1_src;
5241             src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2));
5242             //row = 0
5243             src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
5244             au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //????
5245             //separating +ve and and -ve values.
5246             cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
5247             cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
5248             //creating mask 00 for +ve and -ve values and FF for zero.
5249             cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5250             cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5251             //preparing au1_mask
5252             au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
5253             //combining the appropriate sign change
5254             signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5255             signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
5256             pu1_src_left_cpy = pu1_src_left_cpy2;
5257             for(row = ht; row >= 4; row -= 4)
5258             {
5259                 left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy);
5260                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5261                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5262                 // row = 2
5263                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5264                 //manipulation for row 0 -row 1
5265                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5266                 //row 1 left
5267                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5268                 //row 0 -row1
5269                 //separating +ve and and -ve values.
5270                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5271                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5272 
5273                 //creating mask 00 for +ve and -ve values and FF for zero.
5274                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5275                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5276                 //manipulatiing for row 1 -row 0
5277                 signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
5278                 //combining the appropriate sign change
5279                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5280                 //row 1 -row0
5281                 //separating +ve and and -ve values.
5282                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5283                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5284 
5285                 //creating mask 00 for +ve and -ve values and FF for zero.
5286                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5287                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5288                 //row1-row0
5289                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5290 
5291                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
5292 
5293                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
5294                 //manipulation for row 1 -row 2
5295                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
5296                 //row 2 left
5297                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5298                 //packing row 0 n row 1
5299                 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
5300                 //row1 -row2
5301                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5302                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5303                 //creating mask 00 for +ve and -ve values and FF for zero.
5304                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5305                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5306                 //combining the appropriate sign change
5307                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
5308                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
5309 
5310                 //row 1 right
5311                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
5312                 //row = 3
5313                 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
5314 
5315                 // row = 4
5316                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
5317 
5318                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
5319 
5320                 //separating +ve and and -ve values.(2,1)
5321                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5322                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5323 
5324                 //creating mask 00 for +ve and -ve values and FF for zero.
5325                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5326                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5327                 //row 2 right
5328                 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
5329                 //combining the appropriate sign change
5330                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
5331 
5332                 //separating +ve and and -ve values.(3,2)
5333                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
5334                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
5335                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
5336                 //creating mask 00 for +ve and -ve values and FF for zero.
5337                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5338                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5339                 //manipulation for row 2 -row 3
5340                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 8);
5341                 //row 3 left
5342                 signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
5343                 //combining the appropriate sign change
5344                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
5345 
5346                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
5347 
5348                 //separating +ve and and -ve values.(2,3)
5349                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5350                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5351 
5352                 //manipulation for row 3 -bottom
5353                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 6);
5354                 //bottom left
5355                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5356 
5357                 //creating mask 00 for +ve and -ve values and FF for zero.
5358                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5359                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5360                 //combining the appropriate sign change
5361                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
5362 
5363                 //separating +ve and and -ve values.(3,bottom)
5364                 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
5365                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
5366 
5367                 //creating mask 00 for +ve and -ve values and FF for zero.
5368                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5369                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5370                 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
5371                 //combining the appropriate sign change
5372                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
5373                 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
5374 
5375 
5376                 //eliminating old left for row 0,1,2,3
5377                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
5378                 //packing row 2 n row 3
5379                 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
5380                 //row 3 right
5381                 signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
5382                 //loading row 3 right into left
5383                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
5384                 //adding bottom and top values of row 2 and row 3
5385                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
5386                 //separating +ve and and -ve values.(botttom,3)
5387                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5388                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5389                 //to store right of row 2
5390                 signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
5391                 //creating mask 00 for +ve and -ve values and FF for zero.
5392                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5393                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5394                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
5395 
5396                 //storing right of row 2into left
5397                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5398                 //to store right of row 0
5399                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5400                 //storing right of row 1 into left
5401                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5402                 //storing right of row 0 into left
5403                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5404 
5405 
5406                 //adding constant 2
5407                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5408                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
5409                 //shuffle to get sao index
5410                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5411                 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
5412                 //using availability mask
5413                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5414                 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
5415                 //adding chroma offset to access U and V
5416                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5417                 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
5418                 //shuffle to get sao offset
5419                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5420                 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
5421 
5422                 //cnvert to 16 bit then add and then saturated pack
5423                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5424                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5425                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5426                 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
5427                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5428                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5429                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
5430                 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
5431 
5432                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
5433                 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
5434                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
5435                 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
5436                 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
5437                 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
5438                 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
5439                 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
5440 
5441                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
5442                 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
5443                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5444                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5445                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5446                 // row = 1
5447                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
5448                 //row = 2
5449                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
5450                 // row = 3
5451                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
5452 
5453                 src_temp0_16x8b = src_temp1_16x8b;
5454                 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
5455                 pu1_src_cpy += (src_strd << 2);
5456                 pu1_src_left_cpy += 8;
5457             }
5458             ht_rem = ht & 0x2;
5459             if(ht_rem)
5460             {
5461                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5462                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5463                 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5464                 // row = 2
5465                 src_bottom_16x8b =  _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5466 
5467                 //manipulation for row 0 -row 1
5468                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5469                 //bottom left
5470                 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5471                 //separating +ve and and -ve values.
5472                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5473                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5474 
5475                 //creating mask 00 for +ve and -ve values and FF for zero.
5476                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5477                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5478                 //manipulation for row 1 - row 0
5479                 signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
5480                 //combining the appropriate sign change
5481                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5482 
5483                 //row1-row0
5484                 //separating +ve and and -ve values.
5485                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5486                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5487 
5488                 //creating mask 00 for +ve and -ve values and FF for zero.
5489                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5490                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5491                 //combining the appropriate sign chang
5492                 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5493 
5494                 //manipulation for row 1 -bottom
5495                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 10);
5496                 //bottom left
5497                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5498 
5499                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
5500                 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
5501                 //row1 -bottom
5502                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5503                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5504 
5505                 //creating mask 00 for +ve and -ve values and FF for zero.
5506                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5507                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5508                 //combining the appropriate sign change
5509                 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
5510                 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
5511 
5512                 //manipulation for bottom- row 1 (row 1 right)
5513                 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
5514                 //adding top and down substraction
5515                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
5516                 //bottom - row 1
5517                 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5518                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5519 
5520                 //eliminating old left for row 0,1
5521                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
5522                 signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
5523                 //creating mask 00 for +ve and -ve values and FF for zero.
5524                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5525                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5526                 //for the next iteration signup0_16x8b
5527                 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
5528 
5529                 //storing right of row 1 into left
5530                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5531                 //for storing right of row 1
5532                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5533 
5534                 src_top_16x8b = src_temp1_16x8b;
5535                 //storing right of row 0 into left
5536                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5537 
5538                 //adding constant 2
5539                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5540 
5541                 //shuffle to get sao index
5542                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5543                 //using availability mask
5544                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5545                 //adding chroma offset to access U and V
5546                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5547                 //shuffle to get sao offset
5548                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5549                 //the next top already in  src_top_16x8b
5550                 //cnvert to 16 bit then add and then saturated pack
5551                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5552                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5553                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5554                 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
5555                 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
5556                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
5557                 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
5558                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
5559 
5560                 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
5561 
5562                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5563                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5564                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5565                 // row = 1
5566                 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
5567                 src_temp0_16x8b = src_bottom_16x8b;
5568                 pu1_src_cpy += (src_strd << 1);
5569                 pu1_src_left_cpy += 4;
5570             }
5571             ht_rem = ht & 0x1;
5572             if(ht_rem)
5573             {
5574                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5575                 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
5576                 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5577 
5578 
5579                 //manipulation for row 0 -bottom
5580                 signdwn1_16x8b =  _mm_slli_si128(left_store_16x8b, 12);
5581                 //bottom left
5582                 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5583                 //separating +ve and and -ve values.
5584                 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5585                 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5586                 //creating mask 00 for +ve and -ve values and FF for zero.
5587                 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5588                 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5589                 //combining the appropriate sign change
5590                 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5591                 //adding top and down substraction
5592                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5593                 //for row 0 right to put into left store
5594                 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5595                 //adding constant 2
5596                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5597                 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
5598                 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
5599                 //left store manipulation 1
5600                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5601                 //filling the left boundary value
5602                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
5603                 src_top_16x8b = src_temp0_16x8b;
5604 
5605                 //shuffle to get sao index
5606                 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5607                 //using availability mask
5608                 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5609                 //adding chroma offset to access U and V
5610                 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
5611                 //shuffle to get sao offset
5612                 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
5613 
5614                 //convert to 16 bit then add and then saturated pack
5615                 signdwn1_16x8b =  _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
5616                 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
5617                 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
5618                 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
5619                 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
5620 
5621                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5622                 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
5623                 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
5624                 pu1_src_cpy += (src_strd);
5625                 src_temp0_16x8b = src_bottom_16x8b;
5626                 pu1_src_left_cpy += 2;
5627             }
5628             {   //for bottom right
5629                 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
5630                 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
5631                 src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
5632                 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5633                 _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
5634             }
5635             if(0 == pu1_avail[3])
5636             {
5637                 src_top_16x8b = src_bottom_16x8b;
5638             }
5639 
5640             _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
5641             pu1_src += 8;
5642         }
5643         pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u;
5644         pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v;
5645         pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u;
5646         pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v;
5647         for(row = 0; row < 2 * ht_tmp; row++)
5648         {
5649             pu1_src_left[row] = au1_src_left_tmp[row];
5650         }
5651     }
5652 
5653 }
5654