@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* ,:file
@*  ihevc_sao_edge_offset_class3.s
@*
@* ,:brief
@*  Contains function definitions for the SAO edge offset filter (class 3).
@* Functions are coded using NEON intrinsics and can be compiled using ARM
@* RVCT
@*
@* ,:author
@*  Parthiban V
@*
@* ,:par List of Functions:
@*  - ihevc_sao_edge_offset_class3_a9q()
@*
@* ,:remarks
@*  None
@*
@*******************************************************************************
@*/
@void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
@                              WORD32 src_strd,
@                              UWORD8 *pu1_src_left,
@                              UWORD8 *pu1_src_top,
@                              UWORD8 *pu1_src_top_left,
@                              UWORD8 *pu1_src_top_right,
@                              UWORD8 *pu1_src_bot_left,
@                              UWORD8 *pu1_avail,
@                              WORD8 *pi1_sao_offset,
@                              WORD32 wd,
@                              WORD32 ht)
@**************Variables Vs Registers*****************************************
@r0 =>  *pu1_src
@r1 =>  src_strd
@r2 =>  *pu1_src_left
@r3 =>  *pu1_src_top
@r4 =>  *pu1_src_top_left
@r5 =>  *pu1_avail
@r6 =>  *pi1_sao_offset
@r7 =>  wd
@r8 =>  ht
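@**************Reference view of the per-pixel operation**********************
@The code below vectorises, 16 pixels at a time, the class 3 edge offset,
@whose neighbours lie on the top-right / bottom-left diagonal.  A rough
@scalar sketch of one interior pixel (sign_a/sign_b are illustrative names,
@bit depth of 8 as in this routine):
@
@    sign_a   = SIGN(pu1_src[x] - pu1_src[x + 1 - src_strd])   top-right
@    sign_b   = SIGN(pu1_src[x] - pu1_src[x - 1 + src_strd])   bottom-left
@    edge_idx = gi1_table_edge_idx[2 + sign_a + sign_b]
@    pu1_src[x] = CLIP3(pu1_src[x] + pi1_sao_offset[edge_idx], 0, 255)
@
@Border pixels use pu1_src_top_right/pu1_src_bot_left and the pu1_avail flags.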

.text
.syntax unified
.p2align 2

.extern gi1_table_edge_idx
.globl ihevc_sao_edge_offset_class3_a9q

gi1_table_edge_idx_addr_1:
.long gi1_table_edge_idx - ulbl1 - 8

gi1_table_edge_idx_addr_2:
.long gi1_table_edge_idx - ulbl2 - 8

gi1_table_edge_idx_addr_3:
.long gi1_table_edge_idx - ulbl3 - 8

ihevc_sao_edge_offset_class3_a9q:


    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
    LDR         r7,[sp,#0x3C]               @Loads wd

    LDR         r8,[sp,#0x40]               @Loads ht
    SUB         r9,r7,#1                    @wd - 1

    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]

    MOV         r9,r7                       @Move width to r9 for loop count

    LDR         r5,[sp,#0x34]               @Loads pu1_avail
    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset
    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp

    SUB         sp,sp,#0x94                 @Decrement the stack pointer to store some temp arr values

    STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
    SUB         r10,r8,#1                   @ht-1
    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
    ADD         r12,sp,#0x02                @temp array

AU1_SRC_TOP_LOOP:
    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
    SUBS        r9,r9,#8                    @Decrement the loop count by 8
    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
    BNE         AU1_SRC_TOP_LOOP

PU1_AVAIL_5_LOOP:
    LDRB        r9,[r5,#5]                  @pu1_avail[5]
    CMP         r9,#0
    SUB         r10,r7,#1                   @[wd - 1]
    LDRB        r9,[r0,r10]                 @u1_pos_wd_0_tmp = pu1_src[wd - 1]
    BEQ         PU1_AVAIL_6_LOOP

    LDR         r11,[sp,#0xC0]              @Load pu1_src_top_right from sp
    SUB         r10,r10,#1                  @[wd - 1 - 1]

    LDRB        r11,[r11]                   @pu1_src_top_right[0]
    SUB         r12,r9,r11                  @pu1_src[wd - 1] - pu1_src_top_right[0]

    ADD         r11,r0,r1                   @pu1_src + src_strd

    LDRB        r14,[r11,r10]               @pu1_src[wd - 1 - 1 + src_strd]
    CMP         r12,#0
    MVNLT       r12,#0
    SUB         r11,r9,r14                  @pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]

    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
ulbl1:
    add         r14,r14,pc
    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
    ADD         r11,r11,#2                  @edge_idx

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0                      @0 != edge_idx
    BEQ         PU1_AVAIL_6_LOOP
    LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r9,r9,r10                   @pu1_src[wd - 1] + pi1_sao_offset[edge_idx]
    USAT        r9,#8,r9                    @u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_6_LOOP:
    LDRB        r10,[r5,#6]                 @pu1_avail[6]
    SUB         r11,r8,#1                   @ht - 1

    CMP         r10,#0
    STR         r0,[sp,#0xC0]               @Store pu1_src in sp
    MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]

    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
    BEQ         PU1_AVAIL_3_LOOP

    LDR         r14,[sp,#0xC4]              @Load pu1_src_bot_left from sp
    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd - src_strd]

    LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
    ADD         r11,r11,#1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]

    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]

    SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
    CMP         r11,#0
    MVNLT       r11,#0
    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])

    CMP         r14,#0
    MVNLT       r14,#0
    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])

    ADD         r11,r11,r14                 @Add 2 sign value

    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
ulbl2:
    add         r14,r14,pc
    ADD         r11,r11,#2                  @edge_idx

    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
    CMP         r12,#0
    BEQ         PU1_AVAIL_3_LOOP
    LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
    ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)

PU1_AVAIL_3_LOOP:
    STR         r2,[sp,#0xC4]               @Store pu1_src_left in sp
    MOV         r12,r8                      @Move ht

    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
    LDRB        r11,[r5,#3]                 @pu1_avail[3]

    CMP         r11,#0
    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
    SUBEQ       r12,r12,#1                  @ht_tmp--

    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
    CMP         r5,#0

    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
    SUBEQ       r12,r12,#1                  @ht_tmp--

    LDR         r6, gi1_table_edge_idx_addr_3 @table pointer
ulbl3:
    add         r6,r6,pc
    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
    ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1

    STR         r0,[sp,#0x90]               @Store pu1_src in sp
    VLD1.8      D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    MOV         r6,r7                       @move wd to r6 loop_count

    CMP         r7,#16                      @Compare wd with 16
    BLT         WIDTH_RESIDUE               @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
    CMP         r8,#4                       @Compare ht with 4
    BLE         WD_16_HT_4_LOOP             @If jump to WD_16_HT_4_LOOP

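@Main path (wd >= 16, ht > 4): the block is processed in vertical strips of
@16 columns.  Within a strip the first row is handled before PU1_SRC_LOOP,
@which then produces two rows (tagged @II and @III below) per iteration so
@that the loads for the following rows overlap the offset computation.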
WIDTH_LOOP_16:
    LDR         r7,[sp,#0xD0]               @Loads wd

    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    CMP         r6,r7                       @col == wd
    LDRBEQ      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         r6,#16                      @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL:
    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    LDR         r4,[sp,#0xD4]               @Loads ht
    SUBEQ       r8,r0,r1                    @pu1_src - src_strd

    MOVNE       r8,r3
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp

    LDR         r7,[sp,#0xD0]               @Loads wd
    ADD         r8,r8,#1                    @pu1_src - src_strd + 1

    SUB         r7,r7,r6                    @(wd - col)
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB         r8,#8
    ADD         r3,r3,#16

    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8
    ADD         r7,r7,#15                   @15 + (wd - col)

    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    SUB         r5,r5,#1

AU1_SRC_LEFT_LOOP:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    SUBS        r4,r4,#1                    @decrement the loop count
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    BNE         AU1_SRC_LEFT_LOOP

    VMOV.I8     Q9,#0
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)

    ADD         r8,r0,r1                    @I *pu1_src + src_strd
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12                      @row count, move ht_tmp to r7

    SUB         r5,r12,r7                   @I ht_tmp - row
    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    ADD         r8,r14,r5                   @I pu1_src_left_cpy[ht_tmp - row]

    ADD         r8,r8,#1                    @I pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

    LDR         r5,[sp,#0xC8]               @I Loads pu1_avail
    VMOV.8      D19[7],r8                   @I vsetq_lane_u8
    LDRB        r5,[r5,#2]                  @I pu1_avail[2]

    VEXT.8      Q9,Q9,Q8,#15                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    CMP         r5,#0                       @I
    BNE         SIGN_UP_CHANGE_DONE         @I

SIGN_UP_CHANGE:
    LDRB        r8,[r0,#15]                 @I pu1_src_cpy[15]
    SUB         r5,r0,r1                    @I pu1_src_cpy[16 - src_strd]

    LDRB        r5,[r5,#16]                 @I load the value
    SUB         r8,r8,r5                    @I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         r8,#0                       @I
    MVNLT       r8,#0                       @I
    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8      D15[7],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE:
    VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q9,Q9,Q5                    @I edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D18,{D6},D18                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)

    VEXT.8      Q7,Q7,Q7,#1                 @I sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8      D19,{D6},D19                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOV        Q6,Q8
    VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1

PU1_SRC_LOOP:
    ADD         r8,r0,r1,LSL #1             @II *pu1_src + src_strd
    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
    SUB         r5,r12,r7                   @II ht_tmp - row

    ADD         r4,r0,r1                    @II pu1_src_cpy[16 - src_strd]
    VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         r2,r8,r1                    @III *pu1_src + src_strd

    LDRB        r11,[r4,#15]                @II pu1_src_cpy[15]
    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1

    ADD         r8,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]
    VLD1.8      D30,[r2]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D31,[r2]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r2,#8
    LDRB        r8,[r8,#1]

    LDRB        r4,[r0,#16]                 @II load the value
    VMOV.8      D19[7],r8                   @II vsetq_lane_u8
    SUB         r11,r11,r4                  @II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]

    CMP         r11,#0                      @II
    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
    SUB         r5,r12,r7                   @III ht_tmp - row

    MVNLT       r11,#0                      @II
    VEXT.8      Q9,Q9,Q8,#15                @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])

    ADD         r8,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]
    VMOV.8      D15[7],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    CMP         r7,#1                       @III

    BNE         NEXT_ROW_ELSE_2             @III
    LDR         r5,[sp,#0xC8]               @III Loads pu1_avail
    LDRB        r5,[r5,#3]                  @III pu1_avail[3]
    CMP         r5,#0                       @III
    SUBNE       r8,r2,#2                    @III pu1_src_cpy[src_strd - 1]

NEXT_ROW_ELSE_2:
    LDRB        r8,[r8,#1]                  @III
    VCGT.U8     Q12,Q6,Q9                   @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    ADD         r5,r0,r1

    LDRB        r2,[r5,#15]                 @III pu1_src_cpy[15]
    VCLT.U8     Q13,Q6,Q9                   @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    LDRB        r5,[r0,#16]                 @III load the value

    SUB         r2,r2,r5                    @III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    VSUB.U8     Q12,Q13,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    CMP         r2,#0                       @III

    MVNLT       r2,#0                       @III
    VMOV.8      D19[7],r8                   @III vsetq_lane_u8
    MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])

    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)

    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
    VEXT.8      Q9,Q9,Q15,#15               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)

    VEXT.8      Q7,Q7,Q7,#1                 @II sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8      D26,{D6},D26                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D15[7],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    VTBL.8      D27,{D6},D27                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)

    VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VTBL.8      D24,{D7},D26                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)

    VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D25,{D7},D27                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)

    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VEXT.8      Q7,Q7,Q7,#1                 @III sign_up = vextq_s8(sign_up, sign_up, 1)
    VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)

    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMOVL.U8    Q11,D17                     @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
    VADDW.S8    Q11,Q11,D11                 @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VMAX.S16    Q11,Q11,Q1                  @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOV        Q6,Q15                      @II pu1_cur_row = pu1_next_row
    VMIN.U16    Q11,Q11,Q2                  @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    CMP         r7,#1                       @III
    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    BGT         PU1_SRC_LOOP                @If not equal jump to PU1_SRC_LOOP
    BLT         INNER_LOOP_DONE

    ADD         r8,r0,r1,LSL #1             @*pu1_src + src_strd
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail

    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    VMOVN.I16   D21,Q11                     @III vmovn_s16(pi2_tmp_cur_row.val[1])
    CMP         r5,#0

    ADD         r4,r0,r1                    @pu1_src_cpy[16 - src_strd]
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDRB        r5,[r0,#16]                 @load the value

    BEQ         NEXT_ROW_ELSE_3
    LDRB        r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    B           NEXT_ROW_POINTER_ASSIGNED_3
NEXT_ROW_ELSE_3:
    SUB         r11,r12,r7                  @ht_tmp - row
    ADD         r8,r14,r11                  @pu1_src_left_cpy[ht_tmp - row]
    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_3:
    LDRB        r11,[r4,#15]                @pu1_src_cpy[15]
    VMOV.8      D19[7],r8                   @vsetq_lane_u8
    SUB         r8,r11,r5                   @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]

    CMP         r8,#0
    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
    MVNLT       r8,#0

    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VCGT.U8     Q12,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)

    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VCLT.U8     Q13,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)

    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
    VSUB.U8     Q12,Q13,Q12                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)

    VMOVL.U8    Q11,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))

    VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q11,Q11,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q11,Q11,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q11,Q11,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

INNER_LOOP_DONE:
    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    LDR         r8,[sp,#0xD4]               @Loads ht

    VMOVN.I16   D21,Q11                     @vmovn_s16(pi2_tmp_cur_row.val[1])
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp

    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
SRC_LEFT_LOOP:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    CMP         r6,#8                       @Check whether residue remains
    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r0,[sp,#0x90]               @Loads *pu1_src
    SUB         r7,r7,r6
    ADD         r0,r0,r7
    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop



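@Shallow-block path (wd >= 16, ht <= 4): same per-strip processing as above,
@but the inner loop emits a single row per iteration instead of the two-row
@software pipeline used in PU1_SRC_LOOP.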
WD_16_HT_4_LOOP:
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDR         r7,[sp,#0xD0]               @Loads wd
    CMP         r6,r7                       @col == wd
    LDRBEQ      r8,[r5]                     @pu1_avail[0]
    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    CMP         r6,#16                      @if(col == 16)
    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3
    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB         r8,#8

    ADD         r3,r3,#16
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    LDR         r4,[sp,#0xD4]               @Loads ht
    LDR         r7,[sp,#0xD0]               @Loads wd
    SUB         r7,r7,r6                    @(wd - col)
    ADD         r7,r7,#15                   @15 + (wd - col)
    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
    SUB         r5,r5,#1

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOV.I8     Q9,#0
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    CMP         r5,#0
    BEQ         NEXT_ROW_ELSE_WD_16_HT_4
    CMP         r7,#1
    LDRBEQ      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
NEXT_ROW_ELSE_WD_16_HT_4:
    SUB         r5,r12,r7                   @ht_tmp - row
    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
    VMOV.8      D19[7],r8                   @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    CMP         r7,r12
    BNE         SIGN_UP_CHANGE_WD_16_HT_4
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
    ADD         r5,r0,#16                   @pu1_src_cpy[16]
    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
    LDRB        r5,[r5]                     @load the value
    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         r8,#0
    MVNLT       r8,#0
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    LDR         r8,[sp,#0xD4]               @Loads ht
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
SRC_LEFT_LOOP_WD_16_HT_4:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    BNE         SRC_LEFT_LOOP_WD_16_HT_4

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    BLE         RE_ASSINING_LOOP            @Jump to re-assigning loop
    BGT         WD_16_HT_4_LOOP             @If not equal jump to width_loop


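@Residue path: handles a trailing strip narrower than 16 columns; only the
@low 8 processed bytes of each row are stored back.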
WIDTH_RESIDUE:
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    CMP         r6,r7                       @wd_residue == wd
    LDRBEQ      r8,[r5]                     @pu1_avail[0]

    MOVNE       r8,#-1
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)

    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)

PU1_AVAIL_2_RESIDUE:
    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3
    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
    SUB         r8,#8


    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
    LDR         r4,[sp,#0xD4]               @Loads ht
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
    SUB         r7,r7,#1                    @(wd - 1)
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 1)]
    SUB         r5,r5,#1

AU1_SRC_LEFT_LOOP_RESIDUE:
    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
    STRB        r8,[r5,#1]!                 @store it in the stack pointer
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE

    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    VMOV.I8     Q9,#0
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDRB        r5,[r5,#3]                  @pu1_avail[3]
    CMP         r5,#0
    BEQ         NEXT_ROW_ELSE_RESIDUE
    CMP         r7,#1
    LDRBEQ      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
NEXT_ROW_ELSE_RESIDUE:
    SUB         r5,r12,r7                   @ht_tmp - row
    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
    LDRB        r8,[r8]

NEXT_ROW_POINTER_ASSIGNED_RESIDUE:
    VMOV.8      D19[7],r8                   @vsetq_lane_u8
    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)

    CMP         r7,r12
    BNE         SIGN_UP_CHANGE_RESIDUE
    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
    ADD         r5,r0,#16                   @pu1_src_cpy[16]
    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
    LDRB        r5,[r5]                     @load the value
    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
    CMP         r8,#0
    MVNLT       r8,#0
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1)

    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D30},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1
    BNE         PU1_SRC_LOOP_RESIDUE

    LDR         r8,[sp,#0xD4]               @Loads ht
    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
    ADD         r5,sp,#0x42                 @*au1_src_left_tmp

SRC_LEFT_LOOP_RESIDUE:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#4
    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP_RESIDUE


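@Final write-back: store the two corner pixels filtered before the main loops
@(r9 and r10), then refresh *pu1_src_top_left and pu1_src_top from the
@unfiltered copies saved on the stack at entry.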
RE_ASSINING_LOOP:
    LDR         r7,[sp,#0xD0]               @Loads wd
    LDR         r0,[sp,#0xC0]               @Loads *pu1_src

    LDR         r11,[sp,#0xD4]              @Loads ht
    ADD         r8,r0,r7                    @pu1_src[wd]

    LDR         r4,[sp,#0xBC]               @Loads pu1_src_top_left
    SUB         r11,r11,#1                  @ht - 1

    STRB        r9,[r8,#-1]                 @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
    MLA         r6,r11,r1,r0                @pu1_src_org[(ht - 1) * src_strd]

    LDRB        r8,[sp]                     @load u1_src_top_left_tmp from stack pointer
    ADD         r12,sp,#0x02

    STRB        r10,[r6]                    @pu1_src_org[(ht - 1) * src_strd] = u1_pos_wd_ht_tmp
    STRB        r8,[r4]                     @*pu1_src_top_left = u1_src_top_left_tmp
    LDR         r3,[sp,#0xCC]               @Loads pu1_src_top

SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!                   @pu1_src_top[col] = au1_src_top_tmp[col]
    SUBS        r7,r7,#8                    @Decrement the width
    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#0x94
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
