@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_edge_offset_class2_chroma.s
@*
@* ,:brief
@*  Contains function definitions for SAO edge offset (class 2, 135 degree)
@* filtering of interleaved chroma. Functions are coded in ARM NEON assembly
@* and can be compiled using ARM RVCT
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*  ihevc_sao_edge_offset_class2_chroma_a9q()
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset_u,
@                              WORD8 *pi1_sao_offset_v,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset_u
@r9 =>  *pi1_sao_offset_v
@r7 =>  wd
@r8 =>  ht
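@**************Operation (reference sketch)***********************************
@ A rough, simplified C-style sketch of the per-sample operation that the
@ NEON code below vectorises (an illustration only, not the library's C
@ reference; boundary handling via pu1_avail is omitted). Class 2 uses the
@ 135 degree diagonal neighbours, and chroma is interleaved UV, so a
@ neighbour is 2 bytes away horizontally, one row up or down. SIGN() returns
@ -1/0/+1 and CLIP3() clamps, as in the comments throughout this file:
@
@ for(row = 0; row < ht; row++)
@ {
@     for(col = 0; col < wd; col++)
@     {
@         WORD32 sign_up   = SIGN(pu1_src[col] - pu1_src[col - 2 - src_strd]);
@         WORD32 sign_down = SIGN(pu1_src[col] - pu1_src[col + 2 + src_strd]);
@         WORD32 edge_idx  = gi1_table_edge_idx[2 + sign_up + sign_down];
@         WORD8  offset    = (col & 1) ? pi1_sao_offset_v[edge_idx]
@                                      : pi1_sao_offset_u[edge_idx];
@         pu1_src[col] = CLIP3(pu1_src[col] + offset, 0, (1 << bit_depth) - 1);
@     }
@     pu1_src += src_strd;
@ }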
62
63.text
64.syntax unified
65.p2align 2
66
67.extern gi1_table_edge_idx
68.globl ihevc_sao_edge_offset_class2_chroma_a9q
69
70gi1_table_edge_idx_addr_1:
71.long gi1_table_edge_idx - ulbl1 - 8
72
73gi1_table_edge_idx_addr_2:
74.long gi1_table_edge_idx - ulbl2 - 8
75
76gi1_table_edge_idx_addr_3:
77.long gi1_table_edge_idx - ulbl3 - 8
78
79gi1_table_edge_idx_addr_4:
80.long gi1_table_edge_idx - ulbl4 - 8
81
82gi1_table_edge_idx_addr_5:
83.long gi1_table_edge_idx - ulbl5 - 8
84
85ihevc_sao_edge_offset_class2_chroma_a9q:
86
87
88    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
89
90    LDR         r7,[sp,#0x40]               @Loads wd
91    LDR         r8,[sp,#0x44]               @Loads ht
92    SUB         r9,r7,#2                    @wd - 2
93
94    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
95    LDRH        r10,[r3,r9]                 @pu1_src_top[wd - 2]
96
97    STR         r0,[sp,#0x2C]               @Store pu1_src in sp
98    MOV         r9,r7                       @Move width to r9 for loop count
99
100    STR         r2,[sp,#0x30]               @Store pu1_src_left in sp
101    LDR         r5,[sp,#0x34]               @Loads pu1_avail
102    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset_u
103
104    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
105    SUB         sp,sp,#0xD4                 @Decrement the stack pointer to store some temp arr values
106
107    STRH        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 2]
108    SUB         r10,r8,#1                   @ht-1
109    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
110    ADD         r12,sp,#10                  @temp array
111
112AU1_SRC_TOP_LOOP:
113    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
114    SUBS        r9,r9,#8                    @Decrement the loop count by 8
115    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
116    BNE         AU1_SRC_TOP_LOOP
117
118PU1_AVAIL_4_LOOP_U:
119    LDRB        r9,[r5,#4]                  @pu1_avail[4]
120    CMP         r9,#0
121    LDRB        r9,[r0]                     @u1_pos_0_0_tmp_u = pu1_src[0]
122    LDRB        r10,[r0,#1]                 @u1_pos_0_0_tmp_v = pu1_src[1]
123    BEQ         PU1_AVAIL_7_LOOP_U
124
125    LDRB        r11,[r4]                    @pu1_src_top_left[0]
126    ADD         r14,r0,r1                   @pu1_src + src_strd
127
128    SUB         r12,r9,r11                  @pu1_src[0] - pu1_src_top_left[0]
129
130    LDRB        r14,[r14,#2]                @pu1_src[2 + src_strd]
131    CMP         r12,#0
132
133    MVNLT       r12,#0
134    SUB         r11,r9,r14                  @pu1_src[0] - pu1_src[2 + src_strd]
135
136    MOVGT       r12,#1                      @SIGN(pu1_src[0] - pu1_src_top_left[0])
137
138    CMP         r11,#0
139    MVNLT       r11,#0
140    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
141ulbl1:
142    add         r14,r14,pc
143    MOVGT       r11,#1                      @SIGN(pu1_src[0] - pu1_src[2 + src_strd])
144
145    ADD         r11,r12,r11                 @SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[2 + src_strd])
146    ADD         r11,r11,#2                  @edge_idx
147
148    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
149    CMP         r12,#0                      @0 != edge_idx
150    BEQ         PU1_AVAIL_4_LOOP_V
151    LDRSB       r11,[r6,r12]                @pi1_sao_offset_u[edge_idx]
152    ADD         r9,r9,r11                   @pu1_src[0] + pi1_sao_offset_u[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)
154
155PU1_AVAIL_4_LOOP_V:
156
157    LDRB        r11,[r4,#1]                 @pu1_src_top_left[1]
158    ADD         r14,r0,r1                   @pu1_src + src_strd
159
160    SUB         r12,r10,r11                 @pu1_src[1] - pu1_src_top_left[1]
161    LDRB        r14,[r14,#3]                @pu1_src[3 + src_strd]
162
163    CMP         r12,#0
164    MVNLT       r12,#0
165    SUB         r11,r10,r14                 @pu1_src[1] - pu1_src[3 + src_strd]
    MOVGT       r12,#1                      @SIGN(pu1_src[1] - pu1_src_top_left[1])
167
168    CMP         r11,#0
169    MVNLT       r11,#0
170    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
171ulbl2:
172    add         r14,r14,pc
    MOVGT       r11,#1                      @SIGN(pu1_src[1] - pu1_src[3 + src_strd])

    ADD         r11,r12,r11                 @SIGN(pu1_src[1] - pu1_src_top_left[1]) +  SIGN(pu1_src[1] - pu1_src[3 + src_strd])
176    ADD         r11,r11,#2                  @edge_idx
177
178    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
179    CMP         r12,#0                      @0 != edge_idx
180    BEQ         PU1_AVAIL_7_LOOP_U
181    LDR         r11,[sp,#0x110]             @Loads pi1_sao_offset_v
182    LDRSB       r11,[r11,r12]               @pi1_sao_offset_v[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[1] + pi1_sao_offset_v[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
185
186PU1_AVAIL_7_LOOP_U:
187    STRB        r10,[sp,#7]
188    STRB        r9,[sp,#6]
189
190    LDRB        r10,[r5,#7]                 @pu1_avail[7]
191    CMP         r10,#0
192    SUB         r10,r7,#2                   @wd - 2
193    SUB         r11,r8,#1                   @ht - 1
194    MLA         r12,r11,r1,r10              @wd - 2 + (ht - 1) * src_strd
195    ADD         r12,r12,r0                  @pu1_src[wd - 2 + (ht - 1) * src_strd]
196    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
    LDRB        r9,[r12,#1]                 @u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd]
198    BEQ         PU1_AVAIL_3_LOOP
199
200    SUB         r11,r12,r1                  @pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
201    SUB         r11,r11,#2                  @pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
202    LDRB        r11,[r11]                   @Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
203    SUB         r11,r10,r11                 @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd]
204    CMP         r11,#0
205    MVNLT       r11,#0
206    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd])
207
208    ADD         r14,r12,r1                  @pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
209    ADD         r14,r14,#2                  @pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
210    LDRB        r14,[r14]                   @Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
    SUB         r14,r10,r14                 @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
212    CMP         r14,#0
213    MVNLT       r14,#0
214    MOVGT       r14,#1                      @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])
215
216    ADD         r11,r11,r14                 @Add 2 sign value
217    ADD         r11,r11,#2                  @edge_idx
218    LDR         r14, gi1_table_edge_idx_addr_3 @table pointer
219ulbl3:
220    add         r14,r14,pc
221
222    LDRSB       r14,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
223    CMP         r14,#0
224    BEQ         PU1_AVAIL_7_LOOP_V
225    LDRSB       r11,[r6,r14]                @pi1_sao_offset_u[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1)
228
229PU1_AVAIL_7_LOOP_V:
230    ADD         r12,r12,#1
231    SUB         r11,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
232    SUB         r11,r11,#2                  @pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
233    LDRB        r11,[r11]                   @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
234    SUB         r11,r9,r11                  @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 2 - src_strd]
235    CMP         r11,#0
236    MVNLT       r11,#0
237    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])
238
239    ADD         r14,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
240    ADD         r14,r14,#2                  @pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
241    LDRB        r14,[r14]                   @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
242    SUB         r14,r9,r14                  @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
243    CMP         r14,#0
244    MVNLT       r14,#0
    MOVGT       r14,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd])
246
247    ADD         r11,r11,r14                 @Add 2 sign value
248    ADD         r11,r11,#2                  @edge_idx
249    LDR         r14, gi1_table_edge_idx_addr_4 @table pointer
250ulbl4:
251    add         r14,r14,pc
252
253    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
254    CMP         r12,#0
255    BEQ         PU1_AVAIL_3_LOOP
256    LDR         r14,[sp,#0x110]             @Loads pi1_sao_offset_v
257    LDRSB       r11,[r14,r12]               @pi1_sao_offset_v[edge_idx]
    ADD         r9,r9,r11                   @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
260
261PU1_AVAIL_3_LOOP:
262    STRB        r10,[sp,#8]
263    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
264    STRB        r9,[sp,#9]
265
266    MOV         r12,r8                      @Move ht
267    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
268    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
269
270    LDRB        r11,[r5,#3]                 @pu1_avail[3]
271    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
272    CMP         r11,#0
273
274    SUBEQ       r12,r12,#1                  @ht_tmp--
275    LDRB        r5,[r5,#2]                  @pu1_avail[2]
276
277    CMP         r5,#0
278
279    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
280    VLD1.8      D6,[r6]                     @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
281    SUBEQ       r12,r12,#1                  @ht_tmp--
282
283    LDR         r6,[sp,#0x110]              @Loads pi1_sao_offset_v
284    ADDEQ       r14,r14,#2                  @pu1_src_left_cpy += 2
285
286    STR         r0,[sp,#2]                  @Store pu1_src in sp
287    VLD1.8      D7,[r6]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
288    LDR         r2, gi1_table_edge_idx_addr_5 @table pointer
289ulbl5:
290    add         r2,r2,pc
291
292    MOV         r6,r7                       @move wd to r6 loop_count
293    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
294    CMP         r7,#16                      @Compare wd with 16
295
    BLT         WIDTH_RESIDUE               @If wd < 16 jump to WIDTH_RESIDUE where the loop is unrolled for the 8-pixel case
    CMP         r8,#4                       @Compare ht with 4
    BLE         WD_16_HT_4_LOOP             @If ht <= 4 jump to WD_16_HT_4_LOOP
299
300WIDTH_LOOP_16:
301    LDR         r5,[sp,#0x108]              @Loads pu1_avail
302    LDR         r7,[sp,#0x114]              @Loads wd
303    CMP         r6,r7                       @col == wd
304    LDRBEQ      r8,[r5]                     @pu1_avail[0]
305
306    MOVNE       r8,#-1
307    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
308
309    CMP         r6,#16                      @if(col == 16)
    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
311
312    BNE         SKIP_AU1_MASK_VAL
313    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
315    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
316
317SKIP_AU1_MASK_VAL:
318    LDRB        r9,[r5,#2]                  @pu1_avail[2]
319    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
320    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
321    SUB         r0,#8
322    CMP         r9,#0
323
324    LDR         r4,[sp,#0x118]              @Loads ht
325    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
326
327    LDR         r7,[sp,#0x114]              @Loads wd
328    MOVNE       r8,r3                       @pu1_src_top_cpy
329
330    SUB         r8,r8,#2                    @pu1_src - src_strd - 2
331    ADD         r3,r3,#16
332
333    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
334    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
335    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
336    SUB         r8,#8
337    SUB         r7,r7,r6                    @(wd - col)
338
339    ADD         r7,r7,#14                   @15 + (wd - col)
340    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
341    LDR         r8,[sp,#0x100]              @Loads *pu1_src
342
343    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
344    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
345
346AU1_SRC_LEFT_LOOP:
347    LDRH        r8,[r7]                     @load the value and increment by src_strd
348    SUBS        r4,r4,#1                    @decrement the loop count
349
350    STRH        r8,[r5],#2                  @store it in the stack pointer
351    ADD         r7,r7,r1
352
353    BNE         AU1_SRC_LEFT_LOOP
354
355    ADD         r8,r0,r1                    @I *pu1_src + src_strd
356    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
357    MOV         r7,r12                      @row count, move ht_tmp to r7
358
359    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
360    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
361    SUB         r8,#8
362
363    ADD         r8,r8,#16                   @I
364    VMOV.I8     Q9,#0
365    LDRH        r5,[r8]                     @I pu1_src_cpy[src_strd + 16]
366
367    LDR         r10,[sp,#0x108]             @I Loads pu1_avail
368    VMOV.16     D18[0],r5                   @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
369    LDRB        r10,[r10,#2]                @I pu1_avail[2]
370
371    CMP         r10,#0                      @I
372    VEXT.8      Q9,Q8,Q9,#2                 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
373    BNE         SIGN_UP_CHANGE_DONE         @I
374
375    LDRB        r11,[r0]                    @I pu1_src_cpy[0]
376    SUB         r4,r12,r7                   @I ht_tmp - row
377
    LDRB        r10,[r0,#1]                 @I pu1_src_cpy[1]
379    LSL         r4,r4,#1                    @I (ht_tmp - row) * 2
380
381    ADD         r9,r14,r4                   @I pu1_src_left_cpy[(ht_tmp - row) * 2]
382    LDRB        r5,[r9,#-2]                 @I load the value
383
384    SUB         r8,r11,r5                   @I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
385    LDRB        r5,[r9,#-1]                 @I load the value
386
387    CMP         r8,#0                       @I
388    SUB         r4,r10,r5                   @I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
389
390    MVNLT       r8,#0                       @I
391    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
392
393    CMP         r4,#0                       @I
    VMOV.8      D14[0],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
395    MVNLT       r4,#0                       @I
396
397    MOVGT       r4,#1                       @I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
398    VMOV.8      D14[1],r4                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
399
400SIGN_UP_CHANGE_DONE:
401    VLD1.8      D30,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
402    VCGT.U8     Q10,Q6,Q9                   @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
403
404    VCLT.U8     Q11,Q6,Q9                   @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
405    VSUB.U8     Q11,Q11,Q10                 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
406
407    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
408    VADD.I8     Q9,Q9,Q11                   @I edge_idx = vaddq_s8(edge_idx, sign_down)
409
410    VTBL.8      D18,{D30},D18               @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
411    VNEG.S8     Q7,Q11                      @I sign_up = vnegq_s8(sign_down)
412
413    VTBL.8      D19,{D30},D19               @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
414    VEXT.8      Q7,Q7,Q7,#14                @I sign_up = vextq_s8(sign_up, sign_up, 14)
415
416    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
417    VAND        Q11,Q9,Q4                   @I edge_idx = vandq_s8(edge_idx, au1_mask)
418
419    VMOVL.U8    Q9,D13                      @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
420    VUZP.8      D22,D23                     @I
421
422    VTBL.8      D22,{D6},D22                @I
423    VTBL.8      D23,{D7},D23                @I
424    VZIP.8      D22,D23                     @I
425
426    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
427    VADDW.S8    Q10,Q10,D22                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
428
429    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
430    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
431
432    VADDW.S8    Q9,Q9,D23                   @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
433    VMAX.S16    Q9,Q9,Q1                    @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
434
435    VMIN.U16    Q9,Q9,Q2                    @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
436    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
437
438
439PU1_SRC_LOOP:
440    ADD         r8,r0,r1,LSL #1             @II *pu1_src + src_strd
441    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
442    ADD         r11,r8,r1                   @III *pu1_src + src_strd
443
444    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
445    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
446    SUB         r8,#8
447    VLD1.8      D30,[r11]!                  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
448    VLD1.8      D31,[r11]                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
449    SUB         r11,#8
450
451    ADD         r8,r8,#16                   @II
452    VMOVN.I16   D21,Q9                      @I vmovn_s16(pi2_tmp_cur_row.val[1])
453    LDRH        r5,[r8]                     @II pu1_src_cpy[src_strd + 16]
454
455    ADD         r11,r11,#16                 @III
456    VMOV.16     D28[0],r5                   @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
457    LDRH        r4,[r11]                    @III pu1_src_cpy[src_strd + 16]
458
459    LDRB        r8,[r0,r1]                  @II pu1_src_cpy[0]
460    VEXT.8      Q14,Q8,Q14,#2               @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
461    SUB         r5,r12,r7                   @II ht_tmp - row
462
463    LSL         r5,r5,#1                    @II (ht_tmp - row) * 2
464    VMOV.16     D18[0],r4                   @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
465    ADD         r9,r14,r5                   @II pu1_src_left_cpy[(ht_tmp - row) * 2]
466
467    LDRB        r11,[r9,#-2]                @II load the value
468    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
469    SUB         r8,r8,r11                   @II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
470
471    CMP         r8,#0                       @II
472    VEXT.8      Q9,Q15,Q9,#2                @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    LDRB        r11,[r0,#1]                 @II pu1_src_cpy[1]
474
475    MVNLT       r8,#0                       @II
476    VCGT.U8     Q11,Q6,Q14                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
477    MOVGT       r8,#1                       @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
478
479    LDRB        r5,[r9,#-1]                 @II load the value
    VMOV.8      D14[0],r8                   @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
481    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
482
483    SUB         r11,r11,r5                  @II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
484    VCLT.U8     Q12,Q6,Q14                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
485    CMP         r11,#0                      @II
486
487    MVNLT       r11,#0                      @II
488    VSUB.U8     Q12,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
489    MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
490
491    LDRB        r4,[r0,r1]                  @III pu1_src_cpy[0]
492    VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
493    SUB         r5,r12,r7                   @III ht_tmp - row
494
495    ADD         r10,r0,r1
496    VMOV.8      D14[1],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
497    LSL         r5,r5,#1                    @III (ht_tmp - row) * 2
498
499    ADD         r9,r14,r5                   @III pu1_src_left_cpy[(ht_tmp - row) * 2]
500    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
    LDRB        r10,[r10,#1]                @III pu1_src_cpy[1]
502
503    LDRB        r5,[r9,#-2]                 @III load the value
504    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
505    SUB         r4,r4,r5                    @III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
506
507    CMP         r4,#0                       @III
508    LDRB        r9,[r9,#-1]                 @III load the value
509    VTBL.8      D26,{D22},D26               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
510    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
511
512    MVNLT       r4,#0                       @III
513    SUB         r10,r10,r9                  @III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
514    VTBL.8      D27,{D22},D27               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
515    VEXT.8      Q7,Q7,Q7,#14                @II sign_up = vextq_s8(sign_up, sign_up, 14)
516
517    MOVGT       r4,#1                       @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
518    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
519    CMP         r10,#0                      @III
520
521    VUZP.8      D26,D27                     @II
    VMOV.8      d14[0],r4                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
523
524    MVNLT       r10,#0                      @III
525    MOVGT       r10,#1                      @III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
526    VTBL.8      D24,{D6},D26                @II
527    VCGT.U8     Q10,Q8,Q9                   @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
528
529    VCLT.U8     Q11,Q8,Q9                   @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
530    VTBL.8      D25,{D7},D27                @II
531    VSUB.U8     Q11,Q11,Q10                 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
532
533    VMOV.8      D14[1],r10                  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
534    VZIP.8      D24,D25                     @II
535
536    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
537    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
538
539    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
540    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
541
542    VADD.I8     Q9,Q9,Q11                   @III edge_idx = vaddq_s8(edge_idx, sign_down)
543    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
544
545    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
546    VTBL.8      D18,{D20},D18               @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
547    VNEG.S8     Q7,Q11                      @III sign_up = vnegq_s8(sign_down)
548
549    VTBL.8      D19,{D20},D19               @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
550    VEXT.8      Q7,Q7,Q7,#14                @III sign_up = vextq_s8(sign_up, sign_up, 14)
551
552    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
553    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
554
555    VUZP.8      D18,D19                     @III
556    VTBL.8      D22,{D6},D18                @III
557    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
558
559    VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
560    VTBL.8      D23,{D7},D19                @III
561    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
562
563    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
564    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
565
566    VZIP.8      D22,D23                     @III
567    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
568
569    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
570    VADDW.S8    Q10,Q10,D22                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
571
572    VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
573    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
574
575    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
576    VADDW.S8    Q9,Q9,D23                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
577
578    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
579    VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
580    CMP         r7,#1
581
582    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
583    VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
584
    BGT         PU1_SRC_LOOP                @If row count > 1 jump to PU1_SRC_LOOP
586    BLT         INNER_LOOP_DONE
587
588    ADD         r8,r0,r1,LSL #1             @*pu1_src + src_strd
589    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
590
591    LDRB        r11,[r0,r1]                 @pu1_src_cpy[0]
592    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
593    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
594    SUB         r8,#8
595    SUB         r4,r12,r7                   @ht_tmp - row
596
597    ADD         r8,r8,#16
598    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
599    LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
600
601    LSL         r4,r4,#1                    @(ht_tmp - row) * 2
602    VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
603    ADD         r9,r14,r4                   @pu1_src_left_cpy[(ht_tmp - row) * 2]
604
605    LDRB        r5,[r9,#-2]                 @load the value
606    VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
607    SUB         r8,r11,r5                   @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
608
609    CMP         r8,#0
610    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
611    MVNLT       r8,#0
612
613    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
614    VLD1.8      D30,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
615
    LDRB        r11,[r0,#1]                 @pu1_src_cpy[1]
    VMOV.8      D14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
618    LDRB        r5,[r9,#-1]                 @load the value
619
620    SUB         r4,r11,r5                   @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
621    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
622    CMP         r4,#0
623
624    MVNLT       r4,#0
625    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
626    MOVGT       r4,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
627
628    VMOV.8      D14[1],r4                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
629    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
630
631    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
632    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
633
634    VTBL.8      D26,{D30},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
635    VTBL.8      D27,{D30},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
636
637    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
638    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
639
640    VMOVL.U8    Q9,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
641    VUZP.8      D26,D27
642
643    VTBL.8      D24,{D6},D26
644    VTBL.8      D25,{D7},D27
645    VZIP.8      D24,D25
646
647    VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
648    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
649    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
650
651    VADDW.S8    Q9,Q9,D25                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
652    VMAX.S16    Q9,Q9,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
653    VMIN.U16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
654
655
656INNER_LOOP_DONE:
657    LDR         r8,[sp,#0x118]              @Loads ht
658    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
659    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
660
661    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
662    VMOVN.I16   D21,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[1])
663
664
665SRC_LEFT_LOOP:
666    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
667    SUBS        r8,r8,#2
668    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
669    BNE         SRC_LEFT_LOOP
670
671    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
672    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
673    CMP         r6,#8                       @Check whether residue remains
674
675    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
676    LDR         r7,[sp,#0x114]              @Loads wd
677    LDR         r0,[sp,#0x02]               @Loads *pu1_src
678    SUB         r7,r7,r6
679    ADD         r0,r0,r7
680    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
681    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
682
683
684WD_16_HT_4_LOOP:
685    LDR         r5,[sp,#0x108]              @Loads pu1_avail
686    LDR         r7,[sp,#0x114]              @Loads wd
687    CMP         r6,r7                       @col == wd
688    LDRBEQ      r8,[r5]                     @pu1_avail[0]
689
690    MOVNE       r8,#-1
691    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
693
694    CMP         r6,#16                      @if(col == 16)
695    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
696    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
698    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
699
700SKIP_AU1_MASK_VAL_WD_16_HT_4:
701    LDRB        r8,[r5,#2]                  @pu1_avail[2]
702    CMP         r8,#0
703
704    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
705    MOVNE       r8,r3                       @pu1_src_top_cpy
706    SUB         r8,r8,#2                    @pu1_src - src_strd - 2
707    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
708    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
709    SUB         r8,#8
710
711    ADD         r3,r3,#16
712    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
713    LDR         r4,[sp,#0x118]              @Loads ht
714    LDR         r7,[sp,#0x114]              @Loads wd
715    SUB         r7,r7,r6                    @(wd - col)
716    ADD         r7,r7,#14                   @15 + (wd - col)
717    LDR         r8,[sp,#0x100]              @Loads *pu1_src
718    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
719
720AU1_SRC_LEFT_LOOP_WD_16_HT_4:
721    LDRH        r8,[r7]                     @load the value and increment by src_strd
722    STRH        r8,[r5],#2                  @store it in the stack pointer
723    ADD         r7,r7,r1
724
725    SUBS        r4,r4,#1                    @decrement the loop count
726    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
727
728    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
729    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
730    SUB         r0,#8
731
732    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
733    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
734    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
735    VMOV.I8     Q9,#0
736    MOV         r7,r12                      @row count, move ht_tmp to r7
737
738PU1_SRC_LOOP_WD_16_HT_4:
739    VMOV.I8     Q9,#0
740    ADD         r8,r0,r1                    @*pu1_src + src_strd
741    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
742    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
743    SUB         r8,#8
744
745    ADD         r8,r8,#16
746    LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
747    VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
748    VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
749
750    CMP         r7,r12
751    BLT         SIGN_UP_CHANGE_WD_16_HT_4
752    LDR         r5,[sp,#0x108]              @Loads pu1_avail
753    LDRB        r5,[r5,#2]                  @pu1_avail[2]
754    CMP         r5,#0
755    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4
756
757SIGN_UP_CHANGE_WD_16_HT_4:
758    LDRB        r8,[r0]                     @pu1_src_cpy[0]
759    SUB         r5,r12,r7                   @ht_tmp - row
760    LSL         r5,r5,#1                    @(ht_tmp - row) * 2
761    ADD         r9,r14,r5                   @pu1_src_left_cpy[(ht_tmp - row) * 2]
762    LDRB        r5,[r9,#-2]                 @load the value
763    SUB         r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
764    CMP         r8,#0
765    MVNLT       r8,#0
766    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
768
    LDRB        r8,[r0,#1]                  @pu1_src_cpy[1]
770    LDRB        r5,[r9,#-1]                 @load the value
771    SUB         r8,r8,r5                    @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
772    CMP         r8,#0
773    MVNLT       r8,#0
774    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
775    VMOV.8      d14[1],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
776
777SIGN_UP_CHANGE_DONE_WD_16_HT_4:
778    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
779    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
780    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
781
782    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
783    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
784
785    VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
786    VTBL.8      D26,{D22},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
787    VTBL.8      D27,{D22},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
788
789    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
790
791    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
792    VEXT.8      Q7,Q7,Q7,#14                @sign_up = vextq_s8(sign_up, sign_up, 14)
793
794    VUZP.8      D26,D27
795    VTBL.8      D24,{D6},D26
796    VTBL.8      D25,{D7},D27
797    VZIP.8      D24,D25
798
799    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
800    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
801    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
802    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
803
804    VMOVL.U8    Q13,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
805    VADDW.S8    Q13,Q13,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
806    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
807    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
808
809    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
810    VMOVN.I16   D29,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[1])
811
812    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
813
814    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
815    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
816    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
817
818    LDR         r8,[sp,#0x118]              @Loads ht
819    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
820    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
821
822SRC_LEFT_LOOP_WD_16_HT_4:
823    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
824    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
825
826    SUBS        r8,r8,#2
827    BNE         SRC_LEFT_LOOP_WD_16_HT_4
828
829
830    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
831    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
832    BGT         WD_16_HT_4_LOOP
833
834
835WIDTH_RESIDUE:
836    LDR         r7,[sp,#0x114]              @Loads wd
837    LDR         r5,[sp,#0x108]              @Loads pu1_avail
838    CMP         r6,r7                       @wd_residue == wd
839    LDRBEQ      r8,[r5]                     @pu1_avail[0]
840
841    MOVNE       r8,#-1
842    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    VMOV.8      d8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1)
844
845    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d8[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7)
848
849    LDRB        r8,[r5,#2]                  @pu1_avail[2]
850    CMP         r8,#0
851
852    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
853    MOVNE       r8,r3
854    SUB         r8,r8,#2                    @pu1_src - src_strd - 2
855    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
856    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
857    SUB         r8,#8
858
859    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
860    LDR         r4,[sp,#0x118]              @Loads ht
861    LDR         r7,[sp,#0x114]              @Loads wd
862    LDR         r8,[sp,#0x100]              @Loads *pu1_src
863    SUB         r7,r7,#2                    @(wd - 2)
864    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 2)]
865
866AU1_SRC_LEFT_LOOP_RESIDUE:
867    LDRH        r8,[r7]                     @load the value and increment by src_strd
868    STRH        r8,[r5],#2                  @store it in the stack pointer
869    ADD         r7,r7,r1
870    SUBS        r4,r4,#1                    @decrement the loop count
871    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
872
873    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
874    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
875    SUB         r0,#8
876
877    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
878    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
879    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
880    MOV         r7,r12                      @row count, move ht_tmp to r7
881
882PU1_SRC_LOOP_RESIDUE:
883    VMOV.I8     Q9,#0
884    ADD         r8,r0,r1                    @*pu1_src + src_strd
885    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
886    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
887    SUB         r8,#8
888
889    ADD         r8,r8,#16
890    LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
891    VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
892    VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
893
894    CMP         r7,r12
895    BLT         SIGN_UP_CHANGE_RESIDUE
896    LDR         r5,[sp,#0x108]              @Loads pu1_avail
897    LDRB        r5,[r5,#2]                  @pu1_avail[2]
898    CMP         r5,#0
899    BNE         SIGN_UP_CHANGE_DONE_RESIDUE
900
901SIGN_UP_CHANGE_RESIDUE:
902    LDRB        r8,[r0]                     @pu1_src_cpy[0]
903    SUB         r5,r12,r7                   @ht_tmp - row
904    LSL         r5,r5,#1                    @(ht_tmp - row) * 2
    ADD         r9,r14,r5                   @pu1_src_left_cpy[(ht_tmp - row) * 2]
906    LDRB        r5,[r9,#-2]                 @load the value
907    SUB         r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
908    CMP         r8,#0
909    MVNLT       r8,#0
910    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
911    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
912
    LDRB        r8,[r0,#1]                  @pu1_src_cpy[1]
    LDRB        r5,[r9,#-1]                 @load the value
    SUB         r8,r8,r5                    @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
916    CMP         r8,#0
917    MVNLT       r8,#0
918    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
919    VMOV.8      d14[1],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
920
921SIGN_UP_CHANGE_DONE_RESIDUE:
922    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
923    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
924    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
925
926    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
927    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
928
929    VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
930    VTBL.8      D26,{D22},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
931    VTBL.8      D27,{D22},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
932
933    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
934
935    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
936    VEXT.8      Q7,Q7,Q7,#14                @sign_up = vextq_s8(sign_up, sign_up, 14)
937
938    VUZP.8      D26,D27
939    VTBL.8      D24,{D6},D26
940    VTBL.8      D25,{D7},D27
941    VZIP.8      D24,D25
942
943    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
944    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
945    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
946    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
947
948    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
949
950    VST1.8      {D28},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
951
952    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
953    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
954    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to PU1_SRC_LOOP
955
956    LDR         r8,[sp,#0x118]              @Loads ht
957    LDR         r11,[sp,#0x104]             @Loads *pu1_src_left
958    ADD         r5,sp,#0x4B                 @*au1_src_left_tmp
959
960SRC_LEFT_LOOP_RESIDUE:
961    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
962    SUBS        r8,r8,#2
963    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
964
965    BNE         SRC_LEFT_LOOP_RESIDUE
966
967
968RE_ASSINING_LOOP:
969    LDR         r8,[sp,#0x118]              @Loads ht
970
971    LDR         r0,[sp,#0x100]              @Loads *pu1_src
972    SUB         r8,r8,#1                    @ht - 1
973
974    LDR         r7,[sp,#0x114]              @Loads wd
975
    LDRH        r9,[sp,#6]
    MLA         r6,r8,r1,r7                 @wd + (ht - 1) * src_strd

    STRH        r9,[r0]                     @pu1_src_org[0] = u1_pos_0_0_tmp_u, pu1_src_org[1] = u1_pos_0_0_tmp_v
    ADD         r6,r0,r6                    @pu1_src[wd + (ht - 1) * src_strd]

    LDRH        r9,[sp,#8]
    ADD         r12,sp,#10
    STRH        r9,[r6,#-2]                 @pu1_src_org[wd - 2 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v
985
986    LDR         r4,[sp,#0xFC]               @Loads pu1_src_top_left
987    LDRH        r10,[sp]                    @load u1_src_top_left_tmp from stack pointer
988    STRH        r10,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
989    LDR         r3,[sp,#0x10C]              @Loads pu1_src_top
990
991SRC_TOP_LOOP:
992    VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
993    SUBS        r7,r7,#8                    @Decrement the width
994    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
995    BNE         SRC_TOP_LOOP
996
997END_LOOPS:
998    ADD         sp,sp,#0xD4
999    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
1000
1001
1002
1003