1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_sao_edge_offset_class3_chroma.s
22//*
23//* @brief
24//*  Contains the function definition for the class 3 SAO edge offset of
25//*  chroma samples. The function is hand-written AArch64 NEON assembly in
26//*  GNU assembler syntax.
27//*
28//* @author
29//*  Parthiban V
30//*
31//* @par List of Functions:
32//*  - ihevc_sao_edge_offset_class3_chroma_av8()
33//*
34//* @remarks
35//*  None
36//*
37//*******************************************************************************
38//*/
39//void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
40//                              WORD32 src_strd,
41//                              UWORD8 *pu1_src_left,
42//                              UWORD8 *pu1_src_top,
43//                              UWORD8 *pu1_src_top_left,
44//                              UWORD8 *pu1_src_top_right,
45//                              UWORD8 *pu1_src_bot_left,
46//                              UWORD8 *pu1_avail,
47//                              WORD8 *pi1_sao_offset_u,
48//                              WORD8 *pi1_sao_offset_v,
49//                              WORD32 wd,
50//                              WORD32 ht)
51//**************Variables Vs Registers (after the prologue)*******************
52//x0    =>    *pu1_src
53//x1    =>    src_strd
54//x2    =>    *pu1_src_left
55//x3    =>    *pu1_src_top
56//x4    =>    *pu1_src_top_left
57//x5    =>    *pu1_avail
58//x6    =>    *pi1_sao_offset_u
59//x23   =>    *pi1_sao_offset_v (x16 => *pu1_src_top_right, x17 => *pu1_src_bot_left)
60//x7    =>    wd
61//x8    =>    ht
62
63.text                                           // assemble what follows into the code section
64.p2align 2                                      // align to 2^2 = 4 bytes
65.include "ihevc_neon_macros.s"                  // macro file; its contents are not visible here
66.globl gi1_table_edge_idx                       // edge-index table, addressed through the GOT by the function below; not defined in this view
67.globl ihevc_sao_edge_offset_class3_chroma_av8  // export the function entry label
68
69ihevc_sao_edge_offset_class3_chroma_av8:
70
71
72    // STMFD sp!,{x4-x12,x14}            //stack stores the values of the arguments
73
74
75    ldr         x8,[sp,#0]
76    ldr         x9,[sp,#8]
77    ldr         w10,[sp,#16]
78    ldr         w11,[sp,#24]
79
80
81    // STMFD sp!, {x4-x12, x14}            //stack stores the values of the arguments
82    stp         x19, x20,[sp,#-16]!
83    stp         x21, x22,[sp,#-16]!
84    stp         x23, x24,[sp,#-16]!
85    stp         x25, x26,[sp,#-16]!
86    stp         x27, x28,[sp,#-16]!
87
88    mov         x15,x4 // *pu1_src_top_left 0x28
89    mov         x16,x5 // *pu1_src_top_right 0x2c
90    mov         x17,x6 // *pu1_src_bot_left 0x30
91    mov         x21,x7 // *pu1_avail 0x34
92    mov         x22,x8 // *pi1_sao_offset_u 0x38
93    mov         x23,x9 // *pi1_sao_offset_v 0x3c
94    mov         x24,x10 // wd 0x40
95    mov         x25,x11 // ht 0x44
96
97
98    mov         w7, w24                     //Loads wd
99    mov         w8, w25                     //Loads ht
100    SUB         x9,x7,#2                    //wd - 2
101
102    mov         x4, x15                     //Loads pu1_src_top_left
103    LDRH        w10,[x3,x9]                 //pu1_src_top[wd - 2]
104
105    MOV         x9,x7                       //Move width to x9 for loop count
106
107    mov         x5, x21                     //Loads pu1_avail
108    mov         x6, x22                     //Loads pi1_sao_offset_u
109
110    mov         x22, x3                     //Store pu1_src_top in sp
111    SUB         sp,sp,#0xE0                 //Decrement the stack pointer to store some temp arr values
112
113    STRH        w10,[sp]                    //u1_src_top_left_tmp = pu1_src_top[wd - 2]
114    SUB         x10,x8,#1                   //ht-1
115    madd        x11, x10, x1, x0            //pu1_src[(ht - 1) * src_strd + col]
116    ADD         x12,sp,#10                  //temp array
117
118AU1_SRC_TOP_LOOP:
119    LD1         {v0.8b},[x11],#8            //pu1_src[(ht - 1) * src_strd + col]
120    SUBS        x9,x9,#8                    //Decrement the loop count by 8
121    ST1         {v0.8b},[x12],#8            //au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
122    BNE         AU1_SRC_TOP_LOOP
123
124PU1_AVAIL_5_LOOP_U:
125    LDRB        w9,[x5,#5]                  //pu1_avail[5]
126    CMP         x9,#0
127    SUB         x14,x7,#2                   //[wd - 2]
128    LDRB        w9,[x0,x14]                 //u1_pos_0_0_tmp_u = pu1_src[wd - 2]
129    SUB         x11,x7,#1                   //[wd - 1]
130    LDRB        w10,[x0,x11]                //u1_pos_0_0_tmp_v = pu1_src[wd - 1]
131    BEQ         PU1_AVAIL_6_LOOP_U
132
133    mov         x11, x16                    //Load pu1_src_top_right from sp
134    LDRB        w11,[x11]                   //pu1_src_top_right[0]
135    SUB         x12,x9,x11                  //pu1_src[wd - 2] - pu1_src_top_right[0]
136    CMP         x12,#0
137    movn        x20,#0
138    csel        x12, x20, x12,LT
139    MOV         x20,#1
140    csel        x12, x20, x12,GT            //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0])
141    ADD         x11,x0,x1                   //pu1_src + src_strd
142    SUB         x14,x14,#2                  //[wd - 2 - 2]
143    LDRB        w14,[x11,x14]               //pu1_src[wd - 2 - 2 + src_strd]
144    SUB         x11,x9,x14                  //pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]
145    CMP         x11,#0
146    movn        x20,#0
147    csel        x11, x20, x11,LT
148    MOV         x20,#1
149    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
150    ADD         x11,x12,x11                 //SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd])
151    ADD         x11,x11,#2                  //edge_idx
152    ADRP        x14, :got:gi1_table_edge_idx //table pointer
153    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
154
155    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
156    CMP         x12,#0                      //0 != edge_idx
157    BEQ         PU1_AVAIL_5_LOOP_V
158    LDRSB       x11,[x6,x12]                //pi1_sao_offset_u[edge_idx]
159    ADD         x9,x9,x11                   //pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx]
160    mov         x20,#255
161    cmp         x9,x20
162    csel        x9, x20, x9, ge             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
163    mov         x20,#0
164    cmp         x9,x20
165    csel        x9, x20, x9, LT             //u1_pos_0_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
166
167PU1_AVAIL_5_LOOP_V:
168
169    mov         x11, x16                    //Load pu1_src_top_right from sp
170    LDRB        w11,[x11,#1]                //pu1_src_top_right[1]
171    SUB         x12,x10,x11                 //pu1_src[wd - 1] - pu1_src_top_right[1]
172    CMP         x12,#0
173    movn        x20,#0
174    csel        x12, x20, x12,LT
175    MOV         x20,#1
176    csel        x12, x20, x12,GT            //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1])
177    ADD         x11,x0,x1                   //pu1_src + src_strd
178    SUB         x14,x7,#3                   //[wd - 1 - 2]
179    LDRB        w14,[x11,x14]               //pu1_src[wd - 1 - 2 + src_strd]
180    SUB         x11,x10,x14                 //pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]
181    CMP         x11,#0
182    movn        x20,#0
183    csel        x11, x20, x11,LT
184    MOV         x20,#1
185    csel        x11, x20, x11,GT            //SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
186    ADD         x11,x12,x11                 //SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd])
187    ADD         x11,x11,#2                  //edge_idx
188    ADRP        x14, :got:gi1_table_edge_idx //table pointer
189    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
190
191    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
192    CMP         x12,#0                      //0 != edge_idx
193    BEQ         PU1_AVAIL_6_LOOP_U
194    mov         x11, x23                    //Loads pi1_sao_offset_v
195    LDRSB       x11,[x11,x12]               //pi1_sao_offset_v[edge_idx]
196    ADD         x10,x10,x11                 //pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx]
197    mov         x20,#255
198    cmp         x10,x20
199    csel        x10, x20, x10, ge           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
200    mov         x20,#0
201    cmp         x10,x20
202    csel        x10, x20, x10, LT           //u1_pos_0_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
203
204PU1_AVAIL_6_LOOP_U:
205    STRB        w9,[sp,#6]
206    STRB        w10,[sp,#7]
207    mov         x26, x0                     //Store pu1_src in sp
208
209    LDRB        w10,[x5,#6]                 //pu1_avail[6]
210    CMP         x10,#0
211    SUB         x11,x8,#1                   //ht - 1
212    madd        x12, x11, x1, x0            //pu1_src[(ht - 1) * src_strd]
213    LDRB        w10,[x12]                   //u1_pos_wd_ht_tmp_u = pu1_src[(ht - 1) * src_strd]
214    LDRB        w9,[x12,#1]                 //u1_pos_wd_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1]
215    BEQ         PU1_AVAIL_3_LOOP
216
217    SUB         x11,x12,x1                  //pu1_src[(ht - 1) * src_strd - src_strd]
218    ADD         x11,x11,#2                  //pu1_src[(ht - 1) * src_strd +  2 - src_strd]
219    LDRB        w11,[x11]                   //Load pu1_src[(ht - 1) * src_strd +  2 - src_strd]
220    SUB         x11,x10,x11                 //pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd]
221    CMP         x11,#0
222    movn        x20,#0
223    csel        x11, x20, x11,LT
224    MOV         x20,#1
225    csel        x11, x20, x11,GT            //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd +  2 - src_strd])
226
227    mov         x14, x17                    //Load pu1_src_bot_left from sp
228    LDRB        w14,[x14]                   //Load pu1_src_bot_left[0]
229    SUB         x14,x10,x14                 //pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
230    CMP         x14,#0
231    movn        x20,#0
232    csel        x14, x20, x14,LT
233    MOV         x20,#1
234    csel        x14, x20, x14,GT            //SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
235
236    ADD         x11,x11,x14                 //Add 2 sign value
237    ADD         x11,x11,#2                  //edge_idx
238    ADRP        x14, :got:gi1_table_edge_idx //table pointer
239    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
240
241    LDRSB       x14,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
242    CMP         x14,#0
243    BEQ         PU1_AVAIL_6_LOOP_V
244    LDRSB       x11,[x6,x14]                //pi1_sao_offset_u[edge_idx]
245    ADD         x10,x10,x11                 //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
246    mov         x20,#255
247    cmp         x10,x20
248    csel        x10, x20, x10, ge           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
249    mov         x20,#0
250    cmp         x10,x20
251    csel        x10, x20, x10, LT           //u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
252
253PU1_AVAIL_6_LOOP_V:
254    ADD         x12,x12,#1                  //pu1_src[(ht - 1) * src_strd + 1]
255    SUB         x11,x12,x1                  //pu1_src[(ht - 1) * src_strd + 1) - src_strd]
256    ADD         x11,x11,#2                  //pu1_src[(ht - 1) * src_strd + 2 - src_strd]
257    LDRB        w11,[x11]                   //Load pu1_src[(ht - 1) * src_strd + 2 - src_strd]
258    SUB         x11,x9,x11                  //pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]
259    CMP         x11,#0
260    movn        x20,#0
261    csel        x11, x20, x11,LT
262    MOV         x20,#1
263    csel        x11, x20, x11,GT            //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd])
264
265    mov         x14, x17                    //Load pu1_src_bot_left from sp
266    LDRB        w14,[x14,#1]                //Load pu1_src_bot_left[1]
267    SUB         x14,x9,x14                  //pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]
268    CMP         x14,#0
269    movn        x20,#0
270    csel        x14, x20, x14,LT
271    MOV         x20,#1
272    csel        x14, x20, x14,GT            //SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1])
273
274    ADD         x11,x11,x14                 //Add 2 sign value
275    ADD         x11,x11,#2                  //edge_idx
276    ADRP        x14, :got:gi1_table_edge_idx //table pointer
277    LDR         x14, [x14, #:got_lo12:gi1_table_edge_idx]
278
279    LDRSB       x12,[x14,x11]               //edge_idx = gi1_table_edge_idx[edge_idx]
280    CMP         x12,#0
281    BEQ         PU1_AVAIL_3_LOOP
282    mov         x14, x23                    //Loads pi1_sao_offset_v
283    LDRSB       x11,[x14,x12]               //pi1_sao_offset_v[edge_idx]
284    ADD         x9,x9,x11                   //pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
285    mov         x20,#255
286    cmp         x9,x20
287    csel        x9, x20, x9, ge             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
288    mov         x20,#0
289    cmp         x9,x20
290    csel        x9, x20, x9, LT             //u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
291
292PU1_AVAIL_3_LOOP:
293    STRB        w10,[sp,#8]
294    STRB        w9,[sp,#9]
295    mov         x27, x2                     //Store pu1_src_left in sp
296
297    MOV         x12,x8                      //Move ht
298    MOV         x14,x2                      //Move pu1_src_left to pu1_src_left_cpy
299    LDRB        w11,[x5,#3]                 //pu1_avail[3]
300    CMP         x11,#0
301    BNE         PU1_AVAIL_2_LOOP
302    SUB         x12,x12,#1                  //ht_tmp--
303
304PU1_AVAIL_2_LOOP:
305    LDRB        w5,[x5,#2]                  //pu1_avail[2]
306    CMP         x5,#0
307    BNE         PU1_AVAIL_2_LOOP_END
308
309    ADD         x0,x0,x1                    //pu1_src += src_strd
310    SUB         x12,x12,#1                  //ht_tmp--
311    ADD         x14,x14,#2                  //pu1_src_left_cpy += 2
312
313PU1_AVAIL_2_LOOP_END:
314    mov         x28, x0                     //Store pu1_src in sp
315    movi        v0.16b, #2                  //const_2 = vdupq_n_s8(2)
316    movi        v2.8h, #0                   //const_min_clip = vdupq_n_s16(0)
317    movi        v4.8h, #255                 //const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
318    LD1         {v6.8b},[x6]                //offset_tbl_u = vld1_s8(pi1_sao_offset_u)
319    mov         x6, x23                     //Loads pi1_sao_offset_v
320    LD1         {v7.8b},[x6]                //offset_tbl_v = vld1_s8(pi1_sao_offset_v)
321    ADRP        x2, :got:gi1_table_edge_idx //table pointer
322    LDR         x2, [x2, #:got_lo12:gi1_table_edge_idx]
323
324    //VLD1.8        D6,[x6]                        @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
325    movi        v1.16b, #0xFF               //au1_mask = vdupq_n_s8(-1)
326    MOV         x6,x7                       //move wd to x6 loop_count
327
328    CMP         x7,#16                      //Compare wd with 16
329    BLT         WIDTH_RESIDUE               //If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
330    CMP         x8,#4                       //Compare ht with 4
331    BLE         WD_16_HT_4_LOOP             //If jump to WD_16_HT_4_LOOP
332
333WIDTH_LOOP_16:
334    mov         w7, w24                     //Loads wd
335    CMP         x6,x7                       //col == wd
336    mov         x5, x21                     //Loads pu1_avail
337
338    LDRb        w20, [x5]                   //pu1_avail[0]
339    csel        w8,w20,w8,EQ
340    MOV         x20,#-1
341    csel        x8, x20, x8,NE
342
343    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
344    LDRB        w11,[x5,#2]                 //pu1_avail[2]
345
346    CMP         x6,#16                      //if(col == 16)
347    mov         v1.b[1], w8                 //au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
348
349    BNE         SKIP_AU1_MASK_VAL
350    LDRB        w8,[x5,#1]                  //pu1_avail[1]
351    mov         v1.b[14], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
352    mov         v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
353
354SKIP_AU1_MASK_VAL:
355    CMP         x11,#0
356    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
357    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
358    //SUB x0, x0,#8
359    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
360
361    SUB         x20,x0,x1                   //pu1_src - src_strd
362    csel        x8, x20, x8,EQ
363    movi        v18.16b, #0
364    csel        x8, x3, x8,NE
365
366    ADD         x8,x8,#2                    //pu1_src - src_strd + 2
367    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
368    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
369    //SUB x8, x8,#8
370    ADD         x3,x3,#16
371
372    mov         w4, w25                     //Loads ht
373    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
374    mov         w7, w24                     //Loads wd
375
376    SUB         x7,x7,x6                    //(wd - col)
377    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
378    ADD         x7,x7,#14                   //15 + (wd - col)
379
380    mov         x8, x26                     //Loads *pu1_src
381    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
382    ADD         x7,x8,x7                    //pu1_src[0 * src_strd + 15 + (wd - col)]
383
384AU1_SRC_LEFT_LOOP:
385    LDRH        w8,[x7]                     //load the value and increment by src_strd
386    SUBS        x4,x4,#1                    //decrement the loop count
387
388    STRH        w8,[x5],#2                  //store it in the stack pointer
389    ADD         x7,x7,x1
390    BNE         AU1_SRC_LEFT_LOOP
391
392
393    MOV         x7,x12                      //row count, move ht_tmp to x7
394    movi        v18.16b, #0                 //I
395    ADD         x11,x0,x1                   //I *pu1_src + src_strd
396
397    SUB         x5,x12,x7                   //I ht_tmp - row
398    LD1         {v16.16b},[x11]             //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
399    //LD1 {v17.8b},[x11]                    //I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
400    //SUB x11, x11,#8
401    ADD         x8,x14,x5,LSL #1            //I pu1_src_left_cpy[(ht_tmp - row) * 2]
402
403    LDRH        w5,[x8,#2]                  //I
404    mov         v18.h[7], w5                //I vsetq_lane_u8
405    mov         x11, x21                    //I Loads pu1_avail
406
407    LDRB        w11,[x11,#2]                //I pu1_avail[2]
408    EXT         v18.16b,  v18.16b ,  v16.16b,#14 //I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
409    CMP         x11,#0                      //I
410    BNE         SIGN_UP_CHANGE_DONE         //I
411
412    LDRB        w8,[x0,#14]                 //I pu1_src_cpy[14]
413    SUB         x5,x0,x1                    //I
414
415    LDRB        w11,[x5,#16]                //I load the value pu1_src_cpy[16 - src_strd]
416
417    LDRB        w9,[x0,#15]                 //I pu1_src_cpy[15]
418    SUB         x8,x8,x11                   //I pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
419
420    LDRB        w10,[x5,#17]                //I load the value pu1_src_cpy[17 - src_strd]
421    CMP         x8,#0                       //I
422
423    movn        x20,#0
424    csel        x8, x20, x8,LT              //I
425    SUB         x9,x9,x10                   //I pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
426
427    MOV         x20,#1
428    csel        x8, x20, x8,GT              //I SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
429    CMP         x9,#0                       //I
430
431    movn        x20,#0
432    csel        x9, x20, x9,LT              //I
433    mov         v17.b[14], w8               //I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
434    MOV         x20,#1
435    csel        x9, x20, x9,GT              //I SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
436
437    mov         v17.b[15], w9               //I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
438
439SIGN_UP_CHANGE_DONE:
440    LD1         {v28.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
441    cmhi        v20.16b,  v5.16b ,  v18.16b //I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
442
443    cmhi        v22.16b,  v18.16b ,  v5.16b //I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
444    SUB         v22.16b,  v22.16b ,  v20.16b //I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
445
446    ADD         v18.16b,  v0.16b ,  v17.16b //I edge_idx = vaddq_s8(const_2, sign_up)
447    ADD         v18.16b,  v18.16b ,  v22.16b //I edge_idx = vaddq_s8(edge_idx, sign_down)
448    TBL         v18.16b, {v28.16b},v18.16b  //I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
449    NEG         v17.16b, v22.16b            //I sign_up = vnegq_s8(sign_down)
450
451    //TBL v19.8b, {v28.16b},v19.8b                //I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
452    EXT         v17.16b,  v17.16b ,  v17.16b,#2 //I sign_up = vextq_s8(sign_up, sign_up, 2)
453
454    Uxtl        v20.8h, v5.8b               //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
455    AND         v18.16b,  v18.16b ,  v1.16b //I edge_idx = vandq_s8(edge_idx, au1_mask)
456    mov         v19.d[0],v18.d[1]
457
458    UZP1        v31.8b, v18.8b, v19.8b
459    UZP2        v19.8b, v18.8b, v19.8b      //I
460    mov         v18.8b,v31.8b
461    TBL         v22.8b, {v6.16b},v18.8b     //I
462    TBL         v23.8b, {v7.16b},v19.8b     //I
463    ZIP1        v31.8b, v22.8b, v23.8b
464    ZIP2        v23.8b, v22.8b, v23.8b      //I
465    mov         v22.8b,v31.8b
466
467    Uxtl2       v18.8h, v5.16b              //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
468    SADDW       v20.8h,  v20.8h ,  v22.8b   //I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
469
470    SMAX        v20.8h,  v20.8h ,  v2.8h    //I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
471    UMIN        v20.8h,  v20.8h ,  v4.8h    //I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
472
473    mov         v5.16b, v16.16b             //I pu1_cur_row = pu1_next_row
474    SADDW       v18.8h,  v18.8h ,  v23.8b   //I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
475
476    SUB         x7,x7,#1                    //I Decrement the ht_tmp loop count by 1
477    SMAX        v18.8h,  v18.8h ,  v2.8h    //I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
478
479    UMIN        v18.8h,  v18.8h ,  v4.8h    //I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
480
481
482PU1_SRC_LOOP:
483    ADD         x11,x0,x1,LSL #1            //II *pu1_src + src_strd
484    xtn         v20.8b,  v20.8h             //I vmovn_s16(pi2_tmp_cur_row.val[0])
485    SUB         x5,x12,x7                   //II ht_tmp - row
486
487    ADD         x4,x0,x1                    //III *pu1_src + src_strd
488    xtn2        v20.16b,  v18.8h            //I vmovn_s16(pi2_tmp_cur_row.val[1])
489    ADD         x8,x14,x5,LSL #1            //II pu1_src_left_cpy[(ht_tmp - row) * 2]
490
491    LDRH        w9,[x8,#2]
492    LD1         {v16.16b},[x11]             //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
493    //LD1 {v17.8b},[x11]                    //II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
494    //SUB x11, x11,#8
495    LDRB        w10,[x4,#14]                //II pu1_src_cpy[14]
496
497    LDRB        w8,[x4,#15]                 //II pu1_src_cpy[15]
498    mov         v28.h[7], w9                //II vsetq_lane_u8
499    ADD         x4,x11,x1                   //III *pu1_src + src_strd
500
501    LDRB        w5,[x0,#17]                 //II load the value pu1_src_cpy[17 - src_strd]
502    LD1         {v30.16b},[x4]              //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
503    //LD1 {v31.8b},[x4]                    //III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
504    //SUB x4, x4,#8
505    LDRB        w11,[x0,#16]                //II load the value pu1_src_cpy[16 - src_strd]
506
507    SUB         x7,x7,#1                    //II Decrement the ht_tmp loop count by 1
508    ST1         { v20.16b},[x0],x1          //I vst1q_u8(pu1_src_cpy, pu1_cur_row)
509    SUB         x10,x10,x11                 //II pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
510
511    CMP         x10,#0                      //II
512    EXT         v28.16b,  v28.16b ,  v16.16b,#14 //II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
513    SUB         x8,x8,x5                    //II pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
514
515    movn        x20,#0
516    csel        x10, x20, x10,LT            //II
517    LD1         {v21.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
518    MOV         x20,#1
519    csel        x10, x20, x10,GT            //II SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
520
521    CMP         x8,#0                       //II
522    mov         v17.b[14], w10              //II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
523    movn        x20,#0
524    csel        x8, x20, x8,LT              //II
525
526    MOV         x20,#1
527    csel        x8, x20, x8,GT              //II SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
528    SUB         x10,x12,x7                  //III ht_tmp - row
529    mov         v17.b[15], w8               //II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
530    ADD         x11,x14,x10,LSL #1          //III pu1_src_left_cpy[(ht_tmp - row) * 2]
531
532    CMP         x7,#1                       //III
533    cmhi        v22.16b,  v5.16b ,  v28.16b //II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
534    BNE         NEXT_ROW_POINTER_ASSIGNED_2 //III
535
536    mov         x5, x21                     //III Loads pu1_avail
537    LDRB        w5,[x5,#3]                  //III pu1_avail[3]
538    CMP         x5,#0                       //III
539    SUB         x20,x4,#4                   //III pu1_src[src_strd - 2]
540    csel        x11, x20, x11,NE
541
542NEXT_ROW_POINTER_ASSIGNED_2:
543    LDRH        w5,[x11,#2]                 //III
544    cmhi        v24.16b,  v28.16b ,  v5.16b //II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
545    ADD         x11,x0,x1                   //III
546
547    LDRB        w9,[x11,#14]                //III pu1_src_cpy[14]
548    mov         v18.h[7], w5                //III vsetq_lane_u8
549    LDRB        w8,[x11,#15]                //III pu1_src_cpy[15]
550
551    LDRB        w11,[x0,#16]                //III load the value pu1_src_cpy[16 - src_strd]
552    SUB         v24.16b,  v24.16b ,  v22.16b //II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
553    LDRB        w10,[x0,#17]                //III load the value pu1_src_cpy[17 - src_strd]
554
555    SUB         x9,x9,x11                   //III pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
556    EXT         v18.16b,  v18.16b ,  v30.16b,#14 //III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
557    SUB         x10,x8,x10                  //III pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
558
559    CMP         x9,#0                       //III
560    ADD         v26.16b,  v0.16b ,  v17.16b //II edge_idx = vaddq_s8(const_2, sign_up)
561    movn        x20,#0
562    csel        x9, x20, x9,LT              //III
563
564    MOV         x20,#1
565    csel        x9, x20, x9,GT              //III SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
566    ADD         v26.16b,  v26.16b ,  v24.16b //II edge_idx = vaddq_s8(edge_idx, sign_down)
567    CMP         x10,#0                      //III
568
569    NEG         v17.16b, v24.16b            //II sign_up = vnegq_s8(sign_down)
570    TBL         v26.16b, {v21.16b},v26.16b  //II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
571    movn        x20,#0
572    csel        x10, x20, x10,LT            //III
573    MOV         x20,#1
574    csel        x10, x20, x10,GT            //III SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
575
576    EXT         v17.16b,  v17.16b ,  v17.16b,#2 //II sign_up = vextq_s8(sign_up, sign_up, 2)
577    //TBL v27.8b, {v21.16b},v27.8b                //II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
578    cmhi        v22.16b,  v16.16b ,  v18.16b //III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
579
580    mov         v17.b[14], w9               //III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
581    AND         v26.16b,  v26.16b ,  v1.16b //II edge_idx = vandq_s8(edge_idx, au1_mask)
582    mov         v27.d[0],v26.d[1]
583
584    mov         v17.b[15], w10              //III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
585    UZP1        v31.8b, v26.8b, v27.8b
586    UZP2        v27.8b, v26.8b, v27.8b      //II
587    mov         v26.8b,v31.8b
588
589    cmhi        v20.16b,  v18.16b ,  v16.16b //III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
590    TBL         v24.8b, {v6.16b},v26.8b     //II
591    SUB         v22.16b,  v20.16b ,  v22.16b //III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
592
593    ADD         v18.16b,  v0.16b ,  v17.16b //III edge_idx = vaddq_s8(const_2, sign_up)
594    TBL         v25.8b, {v7.16b},v27.8b     //II
595    ADD         v18.16b,  v18.16b ,  v22.16b //III edge_idx = vaddq_s8(edge_idx, sign_down)
596
597    LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
598    ZIP1        v31.8b, v24.8b, v25.8b
599    ZIP2        v25.8b, v24.8b, v25.8b      //II
600    mov         v24.8b,v31.8b
601
602    Uxtl        v28.8h, v5.8b               //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
603    TBL         v18.16b, {v20.16b},v18.16b  //III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
604    NEG         v17.16b, v22.16b            //III sign_up = vnegq_s8(sign_down)
605
606    SADDW       v28.8h,  v28.8h ,  v24.8b   //II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
607    //TBL v19.8b, {v20.16b},v19.8b                //III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
608    EXT         v17.16b,  v17.16b ,  v17.16b,#2 //III sign_up = vextq_s8(sign_up, sign_up, 2)
609
610    Uxtl2       v26.8h, v5.16b              //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
611    AND         v18.16b,  v18.16b ,  v1.16b //III edge_idx = vandq_s8(edge_idx, au1_mask)
612    mov         v19.d[0],v18.d[1]
613
614    Uxtl        v20.8h, v16.8b              //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
615    UZP1        v31.8b, v18.8b, v19.8b
616    UZP2        v19.8b, v18.8b, v19.8b      //III
617    mov         v18.8b,v31.8b
618
619    SMAX        v28.8h,  v28.8h ,  v2.8h    //II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
620    TBL         v22.8b, {v6.16b},v18.8b     //III
621    UMIN        v28.8h,  v28.8h ,  v4.8h    //II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
622
623    SADDW       v26.8h,  v26.8h ,  v25.8b   //II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
624    TBL         v23.8b, {v7.16b},v19.8b     //III
625    SMAX        v26.8h,  v26.8h ,  v2.8h    //II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
626
627    Uxtl2       v18.8h, v16.16b             //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
628    ZIP1        v31.8b, v22.8b, v23.8b
629    ZIP2        v23.8b, v22.8b, v23.8b      //III
630    mov         v22.8b,v31.8b
631
632    xtn         v28.8b,  v28.8h             //II vmovn_s16(pi2_tmp_cur_row.val[0])
633    SADDW       v20.8h,  v20.8h ,  v22.8b   //III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
634
635    mov         v5.16b, v30.16b             //III pu1_cur_row = pu1_next_row
636    UMIN        v26.8h,  v26.8h ,  v4.8h    //II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
637
638    SUB         x7,x7,#1                    //III Decrement the ht_tmp loop count by 1
639    SMAX        v20.8h,  v20.8h ,  v2.8h    //III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
640    CMP         x7,#1                       //III
641
642    xtn2        v28.16b,  v26.8h            //II vmovn_s16(pi2_tmp_cur_row.val[1])
643    UMIN        v20.8h,  v20.8h ,  v4.8h    //III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
644
645    SADDW       v18.8h,  v18.8h ,  v23.8b   //III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
646
647    SMAX        v18.8h,  v18.8h ,  v2.8h    //III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
648
649    ST1         { v28.16b},[x0],x1          //II vst1q_u8(pu1_src_cpy, pu1_cur_row)
650    UMIN        v18.8h,  v18.8h ,  v4.8h    //III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
651
652    BGT         PU1_SRC_LOOP                //If not equal jump to PU1_SRC_LOOP
653    BLT         INNER_LOOP_DONE
654
655
656    ADD         x11,x0,x1,LSL #1            //*pu1_src + src_strd
657    xtn         v20.8b,  v20.8h             //III vmovn_s16(pi2_tmp_cur_row.val[0])
658    SUB         x5,x12,x7                   //ht_tmp - row
659
660    ADD         x8,x14,x5,LSL #1            //pu1_src_left_cpy[(ht_tmp - row) * 2]
661    xtn2        v20.16b,  v18.8h            //III vmovn_s16(pi2_tmp_cur_row.val[1])
662    CMP         x7,#1
663
664    LDRB        w4,[x0,#16]                 //load the value pu1_src_cpy[16 - src_strd]
665    LD1         {v16.16b},[x11]             //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
666    //LD1 {v17.8b},[x11]                    //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
667    //SUB x11, x11,#8
668    LDRB        w9,[x0,#17]                 //load the value pu1_src_cpy[17 - src_strd]
669
670    BNE         NEXT_ROW_POINTER_ASSIGNED_3
671    mov         x5, x21                     //Loads pu1_avail
672    LDRB        w5,[x5,#3]                  //pu1_avail[3]
673    CMP         x5,#0
674    SUB         x20,x11,#4                  //pu1_src[src_strd - 2]
675    csel        x8, x20, x8,NE
676
677NEXT_ROW_POINTER_ASSIGNED_3:
678    LDRH        w5,[x8,#2]
679    ST1         { v20.16b},[x0],x1          //III vst1q_u8(pu1_src_cpy, pu1_cur_row)
680    LDRB        w8,[x0,#14]                 //pu1_src_cpy[14]
681
682    SUB         x8,x8,x4                    //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
683    mov         v18.h[7], w5                //vsetq_lane_u8
684    LDRB        w10,[x0,#15]                //pu1_src_cpy[15]
685
686    CMP         x8,#0
687    EXT         v18.16b,  v18.16b ,  v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
688    SUB         x10,x10,x9                  //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
689
690    movn        x20,#0
691    csel        x8, x20, x8,LT
692    LD1         {v28.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
693    MOV         x20,#1
694    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
695
696    CMP         x10,#0
697    mov         v17.b[14], w8               //sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 0)
698    movn        x20,#0
699    csel        x10, x20, x10,LT
700
701    MOV         x20,#1
702    csel        x10, x20, x10,GT            //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
703    mov         v17.b[15], w10              //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 1)
704    cmhi        v20.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
705
706    cmhi        v22.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
707    SUB         v22.16b,  v22.16b ,  v20.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
708
709    ADD         v18.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
710    ADD         v18.16b,  v18.16b ,  v22.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
711    TBL         v18.16b, {v28.16b},v18.16b  //vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
712    //TBL v19.8b, {v28.16b},v19.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
713
714    AND         v18.16b,  v18.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
715    mov         v19.d[0],v18.d[1]
716
717    Uxtl        v20.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
718    UZP1        v31.8b, v18.8b, v19.8b
719    UZP2        v19.8b, v18.8b, v19.8b
720    mov         v18.8b,v31.8b
721
722    TBL         v22.8b, {v6.16b},v18.8b
723    TBL         v23.8b, {v7.16b},v19.8b
724
725    Uxtl2       v18.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
726    ZIP1        v31.8b, v22.8b, v23.8b
727    ZIP2        v23.8b, v22.8b, v23.8b
728    mov         v22.8b,v31.8b
729
730    SADDW       v20.8h,  v20.8h ,  v22.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
731    SMAX        v20.8h,  v20.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
732    UMIN        v20.8h,  v20.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
733
734    SADDW       v18.8h,  v18.8h ,  v23.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
735    SMAX        v18.8h,  v18.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
736    UMIN        v18.8h,  v18.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
737
738
739INNER_LOOP_DONE:
740
741    mov         w8, w25                     //Loads ht
742    xtn         v20.8b,  v20.8h             //Narrow the pending 16-bit results in v20 to bytes 0-7 of v20
743    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
744
745    LSL         x8,x8,#1                    //x8 = ht * 2, used below as the byte count of the left-column copy
746    xtn2        v20.16b,  v18.8h            //Narrow the pending 16-bit results in v18 to bytes 8-15 of v20
747    mov         x11, x27                    //Loads *pu1_src_left
748
749SRC_LEFT_LOOP:
750    LDR         w7, [x5],#4                 //Read 4 bytes of au1_src_left_tmp, post-increment x5
751    SUBS        x8,x8,#4                    //4 bytes per pass; NOTE(review): hits zero only if ht * 2 is a multiple of 4 -- confirm ht is always even
752    STR         w7, [x11],#4                //Write the same 4 bytes to pu1_src_left, post-increment x11
753    BNE         SRC_LEFT_LOOP
754
755    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
756    ST1         { v20.16b},[x0],x1          //Store the pending 16-byte row assembled above, then x0 += src_strd
757    CMP         x6,#8                       //Check whether residue remains
758
759    BLT         RE_ASSINING_LOOP            //Fewer than 8 columns left: jump to re-assigning loop
760    mov         w7, w24                     //Loads wd
761    mov         x0, x28                     //Loads *pu1_src
762    SUB         x7,x7,x6                    //wd - remaining columns
763    ADD         x0,x0,x7                    //x0 = pu1_src + (wd - remaining columns), start of the next strip
764    BGT         WIDTH_LOOP_16               //More than 8 columns left (flags still from CMP x6,#8): next 16-wide strip
765    BEQ         WIDTH_RESIDUE               //Exactly 8 columns left: jump to residue loop
766
767WD_16_HT_4_LOOP:
768    mov         w7, w24                     //Loads wd
769
770    mov         x5, x21                     //Loads pu1_avail
771    CMP         x6,x7                       //col == wd
772
773    LDRb        w20, [x5]                   //pu1_avail[0]
774    csel        w8,w20,w8,EQ                //col == wd: w8 = pu1_avail[0]
775    MOV         x20,#-1
776    csel        x8, x20, x8,NE              //col != wd: x8 = -1
777    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(w8, au1_mask, 0)
778
779    CMP         x6,#16                      //if(col == 16)
780    mov         v1.b[1], w8                 //au1_mask = vsetq_lane_s8(w8, au1_mask, 1)
781
782    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
783    LDRB        w8,[x5,#1]                  //pu1_avail[1]
784    mov         v1.b[14], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
785    mov         v1.b[15], w8                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
786
787SKIP_AU1_MASK_VAL_WD_16_HT_4:
788    LDRB        w11,[x5,#2]                 //pu1_avail[2]
789    SUB         x20,x0,x1                   //pu1_src - src_strd
790    CMP         x11,#0
791    csel        x8, x20, x8,EQ              //pu1_avail[2] == 0: x8 = pu1_src - src_strd
792
793    csel        x8, x3, x8,NE               //pu1_avail[2] != 0: x8 = x3 (presumably the running pu1_src_top pointer -- TODO confirm)
794    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
795    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
796    //SUB x0, x0,#8
797    ADD         x8,x8,#2                    //selected top-row pointer + 2
798
799    ADD         x3,x3,#16                   //advance x3 by 16 bytes
800    LD1         {v3.16b},[x8]               //pu1_top_row = vld1q_u8(selected top-row pointer + 2)
801    //LD1 {v11.8b},[x8]                        //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
802    //SUB x8, x8,#8
803    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
804
805    mov         w4, w25                     //Loads ht
806    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
807    mov         w7, w24                     //Loads wd
808
809    SUB         x7,x7,x6                    //(wd - col)
810    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
811    ADD         x7,x7,#14                   //14 + (wd - col): offset of the last 2 bytes of this 16-byte strip
812
813    mov         x8, x26                     //Loads *pu1_src
814    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
815    ADD         x7,x8,x7                    //&pu1_src[0 * src_strd + 14 + (wd - col)]
816
817AU1_SRC_LEFT_LOOP_WD_16_HT_4:
818    LDRH        w8,[x7]                     //load the 2 bytes at the strip's last two columns for this row
819    SUBS        x4,x4,#1                    //decrement the loop count
820
821    STRH        w8,[x5],#2                  //store them in au1_src_left_tmp on the stack, post-increment by 2
822    ADD         x7,x7,x1                    //step to the next row (+= src_strd)
823    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
824
825    movi        v18.16b, #0
826    MOV         x7,x12                      //row count, move ht_tmp to x7
827
828PU1_SRC_LOOP_WD_16_HT_4:
829    ADD         x9,x0,x1                    //*pu1_src + src_strd
830
831    mov         x5, x21                     //Loads pu1_avail
832    LD1         {v16.16b},[x9]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
833    //LD1 {v17.8b},[x9]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
834    //SUB x9, x9,#8
835    LDRB        w5,[x5,#3]                  //pu1_avail[3]
836
837    SUB         x11,x12,x7                  //ht_tmp - row
838    ADD         x8,x14,x11,LSL #1           //pu1_src_left_cpy[(ht_tmp - row) * 2]
839    ADD         x8,x8,#2                    //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
840
841    CMP         x5,#0
842    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
843    CMP         x7,#1
844    SUB         x20,x9,#2                   //pu1_src[src_strd - 2]
845    csel        x8, x20, x8,EQ              //last row (x7 == 1) with pu1_avail[3] != 0: read the pair from the next source row instead
846
847NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
848    LDRH        w5,[x8]                     //2 bytes that sit to the left of pu1_next_row
849    mov         v18.h[7], w5                //place them in the top halfword (bytes 14-15) of v18
850    EXT         v18.16b,  v18.16b ,  v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
851
852    CMP         x7,x12
853    BLT         SIGN_UP_CHANGE_WD_16_HT_4   //not the first row: always recompute sign_up lanes 14/15
854    mov         x5, x21                     //Loads pu1_avail
855    LDRB        w5,[x5,#2]                  //pu1_avail[2]
856    CMP         x5,#0
857    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4 //first row with pu1_avail[2] != 0: keep sign_up as computed from pu1_top_row
858
859SIGN_UP_CHANGE_WD_16_HT_4:
860    LDRB        w8,[x0,#14]                 //pu1_src_cpy[14]
861    SUB         x9,x0,x1                    //pu1_src_cpy - src_strd
862
863    LDRB        w5,[x9,#16]                 //load the value pu1_src_cpy[16 - src_strd]
864
865    LDRB        w10,[x0,#15]                //pu1_src_cpy[15]
866    SUB         x8,x8,x5                    //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
867
868    LDRB        w11,[x9,#17]                //load the value pu1_src_cpy[17 - src_strd]
869    CMP         x8,#0
870
871    movn        x20,#0                      //x20 = -1
872    csel        x8, x20, x8,LT
873    SUB         x10,x10,x11                 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
874
875    MOV         x20,#1
876    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
877
878    CMP         x10,#0
879    mov         v17.b[14], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 14)
880    movn        x20,#0                      //x20 = -1
881    csel        x10, x20, x10,LT
882
883    MOV         x20,#1
884    csel        x10, x20, x10,GT            //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
885    mov         v17.b[15], w10              //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 15)
886
887SIGN_UP_CHANGE_DONE_WD_16_HT_4:
888    LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
889    cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
890
891    cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
892    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
893
894    ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
895    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
896
897    mov         v20.d[1],v20.d[0]           //duplicate the 8-byte table into the upper half of v20
898    NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
899    TBL         v26.16b, {v20.16b},v26.16b  //edge_idx = edge_idx_tbl[edge_idx] for all 16 lanes
900
901    //TBL v27.8b, {v20.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
902    EXT         v17.16b,  v17.16b ,  v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2); lanes 14/15 wrap around and are rewritten at SIGN_UP_CHANGE on the next row
903
904    Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
905    AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
906    mov         v27.d[0],v26.d[1]           //v27 low half = high 8 lanes of edge_idx
907
908    UZP1        v31.8b, v26.8b, v27.8b      //even lanes of edge_idx
909    UZP2        v27.8b, v26.8b, v27.8b      //odd lanes of edge_idx
910    mov         v26.8b,v31.8b
911    TBL         v24.8b, {v6.16b},v26.8b     //look up even-lane offsets in the v6 table
912    TBL         v25.8b, {v7.16b},v27.8b     //look up odd-lane offsets in the v7 table
913    ZIP1        v31.8b, v24.8b, v25.8b      //re-interleave: offsets for bytes 0-7
914    ZIP2        v25.8b, v24.8b, v25.8b      //re-interleave: offsets for bytes 8-15
915    mov         v24.8b,v31.8b
916
917    Uxtl2       v30.8h, v5.16b              //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
918    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
919
920    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
921    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
922
923    mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
924    SADDW       v30.8h,  v30.8h ,  v25.8b   //pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
925
926    SMAX        v30.8h,  v30.8h ,  v2.8h    //pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
927    UMIN        v30.8h,  v30.8h ,  v4.8h    //pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
928
929    xtn         v28.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
930    xtn2        v28.16b,  v30.8h            //vmovn_s16(pi2_tmp_cur_row.val[1])
931
932    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
933    ST1         { v28.16b},[x0],x1          //vst1q_u8(pu1_src_cpy, result row), then pu1_src_cpy += src_strd
934    BNE         PU1_SRC_LOOP_WD_16_HT_4     //If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
935
936    mov         w8, w25                     //Loads ht
937    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
938    mov         x11, x27                    //Loads *pu1_src_left
939
940SRC_LEFT_LOOP_WD_16_HT_4:
941    LDR         w7, [x5],#4                 //Read 4 bytes (two 2-byte row entries) of au1_src_left_tmp
942    SUBS        x8,x8,#2                    //two rows per pass; NOTE(review): hits zero only if ht is even -- confirm
943    STR         w7, [x11],#4                //Write the same 4 bytes to pu1_src_left
944    BNE         SRC_LEFT_LOOP_WD_16_HT_4
945
946    SUBS        x6,x6,#16                   //Decrement the wd loop count by 16
947    CMP         x6,#8                       //Check whether residue remains
948    BLT         RE_ASSINING_LOOP            //Fewer than 8 columns left: jump to re-assigning loop
949    mov         w7, w24                     //Loads wd
950    mov         x0, x28                     //Loads *pu1_src
951    SUB         x7,x7,x6                    //wd - remaining columns
952    ADD         x0,x0,x7                    //x0 = pu1_src + (wd - remaining columns)
953    BGT         WD_16_HT_4_LOOP             //More than 8 columns left (flags still from CMP x6,#8): next 16-wide strip
954    BEQ         WIDTH_RESIDUE               //Exactly 8 columns left: jump to residue loop
955
956WIDTH_RESIDUE:
957    mov         w7, w24                     //Loads wd
958
959    mov         x5, x21                     //Loads pu1_avail
960    CMP         x6,x7                       //wd_residue == wd
961
962    LDRb        w20, [x5]                   //pu1_avail[0]
963    csel        w8,w20,w8,EQ                //wd_residue == wd: w8 = pu1_avail[0]
964
965    MOV         x20,#-1
966    csel        x8, x20, x8,NE              //wd_residue != wd: x8 = -1
967    LDRB        w11,[x5,#1]                 //pu1_avail[1]
968
969    LDRB        w9,[x5,#2]                  //pu1_avail[2]
970    mov         v1.b[0], w8                 //au1_mask = vsetq_lane_s8(w8, au1_mask, 0)
971    CMP         x9,#0
972
973    SUB         x20,x0,x1                   //pu1_src - src_strd
974    csel        x10, x20, x10,EQ            //pu1_avail[2] == 0: x10 = pu1_src - src_strd
975    mov         v1.b[1], w8                 //au1_mask = vsetq_lane_s8(w8, au1_mask, 1)
976    csel        x10, x3, x10,NE             //pu1_avail[2] != 0: x10 = x3 (presumably the running pu1_src_top pointer -- TODO confirm)
977
978    ADD         x10,x10,#2                  //selected top-row pointer + 2
979    mov         v1.b[6], w11                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
980    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
981
982    mov         w4, w25                     //Loads ht
983    mov         v1.b[7], w11                //au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
984    mov         w7, w24                     //Loads wd
985
986    mov         x8, x26                     //Loads *pu1_src
987    LD1         {v3.16b},[x10]              //pu1_top_row = vld1q_u8(selected top-row pointer + 2)
988    //LD1 {v11.8b},[x10]                    //pu1_top_row = vld1q_u8(pu1_src - src_strd + 2)
989    //SUB x10, x10,#8
990    SUB         x7,x7,#2                    //(wd - 2)
991
992    ADD         x7,x8,x7                    //&pu1_src[0 * src_strd + (wd - 2)]
993
994AU1_SRC_LEFT_LOOP_RESIDUE:
995    LDRH        w8,[x7]                     //load the 2 bytes at columns wd-2 and wd-1 of this row
996    ADD         x7,x7,x1                    //step to the next row (+= src_strd)
997    STRH        w8,[x5],#2                  //store them in au1_src_left_tmp on the stack, post-increment by 2
998    SUBS        x4,x4,#1                    //decrement the loop count
999    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
1000
1001    LD1         {v5.16b},[x0]               //pu1_cur_row = vld1q_u8(pu1_src)
1002    //LD1 {v13.8b},[x0]                        //pu1_cur_row = vld1q_u8(pu1_src)
1003    //SUB x0, x0,#8
1004
1005    movi        v18.16b, #0
1006    cmhi        v17.16b,  v5.16b ,  v3.16b  //vcgtq_u8(pu1_cur_row, pu1_top_row)
1007
1008    cmhi        v16.16b,  v3.16b ,  v5.16b  //vcltq_u8(pu1_cur_row, pu1_top_row)
1009    SUB         v17.16b,  v16.16b ,  v17.16b //sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
1010    MOV         x7,x12                      //row count, move ht_tmp to x7
1011
1012PU1_SRC_LOOP_RESIDUE:
1013    ADD         x9,x0,x1                    //*pu1_src + src_strd
1014
1015    SUB         x11,x12,x7                  //ht_tmp - row
1016    LD1         {v16.16b},[x9]              //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
1017    //LD1 {v17.8b},[x9]                        //pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
1018    //SUB x9, x9,#8
1019    mov         x5, x21                     //Loads pu1_avail
1020
1021    LDRB        w5,[x5,#3]                  //pu1_avail[3]
1022    ADD         x8,x14,x11,LSL #1           //pu1_src_left_cpy[(ht_tmp - row) * 2]
1023
1024    CMP         x5,#0
1025    ADD         x8,x8,#2                    //pu1_src_left_cpy[(ht_tmp - row + 1) * 2]
1026
1027    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
1028    CMP         x7,#1
1029    SUB         x20,x9,#2                   //pu1_src[src_strd - 2]
1030    csel        x8, x20, x8,EQ              //last row (x7 == 1) with pu1_avail[3] != 0: read the pair from the next source row instead
1031
1032NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
1033    LDRB        w5,[x8]                     //first of the 2 bytes that sit to the left of pu1_next_row
1034
1035    LDRB        w8,[x8,#1]                  //second of the 2 bytes
1036    mov         v18.b[14], w5               //vsetq_lane_u8(first byte, pu1_next_row_tmp, 14)
1037    CMP         x7,x12
1038
1039    mov         v18.b[15], w8               //vsetq_lane_u8(second byte, pu1_next_row_tmp, 15)
1040    EXT         v18.16b,  v18.16b ,  v16.16b,#14 //pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 14)
1041
1042    BLT         SIGN_UP_CHANGE_RESIDUE      //not the first row (flags from CMP x7,x12): always recompute sign_up lanes 14/15
1043    mov         x5, x21                     //Loads pu1_avail
1044    LDRB        w5,[x5,#2]                  //pu1_avail[2]
1045    CMP         x5,#0
1046    BNE         SIGN_UP_CHANGE_DONE_RESIDUE //first row with pu1_avail[2] != 0: keep sign_up as computed from pu1_top_row
1047
1048SIGN_UP_CHANGE_RESIDUE:
1049    LDRB        w8,[x0,#14]                 //pu1_src_cpy[14]
1050    SUB         x9,x0,x1                    //pu1_src_cpy - src_strd
1051
1052    LDRB        w5,[x9,#16]                 //load the value pu1_src_cpy[16 - src_strd]
1053
1054    LDRB        w10,[x0,#15]                //pu1_src_cpy[15]
1055    SUB         x8,x8,x5                    //pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd]
1056
1057    LDRB        w11,[x9,#17]                //load the value pu1_src_cpy[17 - src_strd]
1058    CMP         x8,#0
1059
1060    movn        x20,#0                      //x20 = -1
1061    csel        x8, x20, x8,LT
1062    SUB         x10,x10,x11                 //pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
1063
1064    MOV         x20,#1
1065    csel        x8, x20, x8,GT              //SIGN(pu1_src_cpy[14] - pu1_src_cpy[16 - src_strd])
1066
1067    CMP         x10,#0
1068    mov         v17.b[14], w8               //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[14] -pu1_src_cpy[16 - src_strd]), sign_up, 14); NOTE(review): only bytes 0-7 are stored in this path, so lanes 14/15 look unused here -- confirm
1069    movn        x20,#0                      //x20 = -1
1070    csel        x10, x20, x10,LT
1071
1072    MOV         x20,#1
1073    csel        x10, x20, x10,GT            //SIGN(pu1_src_cpy[15] - pu1_src_cpy[17 - src_strd]
1074    mov         v17.b[15], w10              //sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] -pu1_src_cpy[17 - src_strd]), sign_up, 15)
1075
1076SIGN_UP_CHANGE_DONE_RESIDUE:
1077    LD1         {v20.8b},[x2]               //edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
1078    cmhi        v22.16b,  v5.16b ,  v18.16b //vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
1079
1080    cmhi        v24.16b,  v18.16b ,  v5.16b //vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
1081    SUB         v24.16b,  v24.16b ,  v22.16b //sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
1082
1083    ADD         v26.16b,  v0.16b ,  v17.16b //edge_idx = vaddq_s8(const_2, sign_up)
1084    ADD         v26.16b,  v26.16b ,  v24.16b //edge_idx = vaddq_s8(edge_idx, sign_down)
1085
1086    mov         v20.d[1],v20.d[0]           //duplicate the 8-byte table into the upper half of v20
1087    NEG         v17.16b, v24.16b            //sign_up = vnegq_s8(sign_down)
1088    TBL         v26.16b, {v20.16b},v26.16b  //edge_idx = edge_idx_tbl[edge_idx] for all 16 lanes
1089
1090    //TBL v27.8b, {v20.16b},v27.8b                //vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
1091    EXT         v17.16b,  v17.16b ,  v17.16b,#2 //sign_up = vextq_s8(sign_up, sign_up, 2)
1092
1093    Uxtl        v28.8h, v5.8b               //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
1094    AND         v26.16b,  v26.16b ,  v1.16b //edge_idx = vandq_s8(edge_idx, au1_mask)
1095    mov         v27.d[0],v26.d[1]           //v27 low half = high 8 lanes of edge_idx
1096
1097    UZP1        v31.8b, v26.8b, v27.8b      //even lanes of edge_idx
1098    UZP2        v27.8b, v26.8b, v27.8b      //odd lanes of edge_idx
1099    mov         v26.8b,v31.8b
1100    TBL         v24.8b, {v6.16b},v26.8b     //look up even-lane offsets in the v6 table
1101    TBL         v25.8b, {v7.16b},v27.8b     //look up odd-lane offsets in the v7 table
1102    ZIP1        v31.8b, v24.8b, v25.8b      //re-interleave: offsets for bytes 0-7
1103    ZIP2        v25.8b, v24.8b, v25.8b      //re-interleave: offsets for bytes 8-15 (not consumed before the next iteration overwrites v25)
1104    mov         v24.8b,v31.8b
1105
1106    mov         v5.16b, v16.16b             //pu1_cur_row = pu1_next_row
1107    SADDW       v28.8h,  v28.8h ,  v24.8b   //pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
1108
1109    SMAX        v28.8h,  v28.8h ,  v2.8h    //pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
1110    UMIN        v28.8h,  v28.8h ,  v4.8h    //pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
1111
1112    SUBS        x7,x7,#1                    //Decrement the ht_tmp loop count by 1
1113    xtn         v30.8b,  v28.8h             //vmovn_s16(pi2_tmp_cur_row.val[0])
1114
1115    ST1         {v30.8b},[x0],x1            //vst1_u8(pu1_src_cpy, low 8 result bytes only), then pu1_src_cpy += src_strd
1116
1117    BNE         PU1_SRC_LOOP_RESIDUE        //If not equal jump to PU1_SRC_LOOP_RESIDUE (flags from SUBS x7)
1118
1119    mov         w8, w25                     //Loads ht
1120    ADD         x5,sp,#0x4B                 //*au1_src_left_tmp
1121
1122    mov         x11, x27                    //Loads *pu1_src_left
1123
1124SRC_LEFT_LOOP_RESIDUE:
1125    LDR         w7, [x5],#4                 //Read 4 bytes (two 2-byte row entries) of au1_src_left_tmp
1126    SUBS        x8,x8,#2                    //two rows per pass; NOTE(review): hits zero only if ht is even -- confirm
1127    STR         w7, [x11],#4                //Write the same 4 bytes to pu1_src_left
1128    BNE         SRC_LEFT_LOOP_RESIDUE
1129
1130
1131RE_ASSINING_LOOP:
1132    mov         w7, w24                     //Loads wd
1133    mov         w8, w25                     //Loads ht
1134
1135    mov         x0, x26                     //Loads *pu1_src
1136    SUB         x10,x7,#2                   //wd - 2
1137
1138    LDRH        w9,[sp,#6]                  //2-byte value saved at [sp,#6] (presumably computed earlier for the top-right position -- TODO confirm)
1139    SUB         x8,x8,#1                    //ht - 1
1140
1141    STRH        w9,[x0,x10]                 //pu1_src_org[wd - 2] = 2-byte value from [sp,#6]
1142    madd        x6, x8, x1, x0              //&pu1_src[(ht - 1) * src_strd]
1143
1144    mov         x4, x15                     //Loads pu1_src_top_left
1145
1146    LDRH        w9,[sp,#8]                  //2-byte value saved at [sp,#8]
1147    ADD         x12,sp,#10                  //x12 = sp + 10, source of the SRC_TOP_LOOP copy (au1_src_top_tmp per the loop comments)
1148
1149    STRH        w9,[x6]                     //pu1_src_org[(ht - 1) * src_strd] = 2-byte value from [sp,#8]
1150
1151    LDRH        w10,[sp]                    //load u1_src_top_left_tmp from stack pointer
1152    STRH        w10,[x4]                    //*pu1_src_top_left = u1_src_top_left_tmp
1153    mov         x3, x22                     //Loads pu1_src_top
1154
1155SRC_TOP_LOOP:
1156    LD1         {v0.8b},[x12],#8            //read 8 bytes of au1_src_top_tmp, post-increment
1157    SUBS        x7,x7,#8                    //Decrement the width; NOTE(review): hits zero only if wd is a multiple of 8 -- confirm
1158    ST1         {v0.8b},[x3],#8             //pu1_src_top[col] = au1_src_top_tmp[col]
1159    BNE         SRC_TOP_LOOP
1160
1161END_LOOPS:
1162    ADD         sp,sp,#0xE0                 //Release the 0xE0 bytes of stack scratch space
1163    // LDMFD sp!,{x4-x12,x15}             //Reload the registers from SP
1164    ldp         x27, x28,[sp],#16           //Restore x27/x28 and pop 16 bytes
1165    ldp         x25, x26,[sp],#16           //Restore x25/x26 and pop 16 bytes
1166    ldp         x23, x24,[sp],#16           //Restore x23/x24 and pop 16 bytes
1167    ldp         x21, x22,[sp],#16           //Restore x21/x22 and pop 16 bytes
1168    ldp         x19, x20,[sp],#16           //Restore x19/x20 and pop 16 bytes
1169
1170    ret
1171
1172
1173
1174