1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_sao_edge_offset_class2_chroma.s
@*
@* @brief
@*  Contains function definitions for SAO (sample adaptive offset) edge
@*  offset, class 2 (135-degree diagonal), on interleaved chroma data.
@*  Functions are coded using NEON intrinsics and can be compiled using ARM
@*  RVCT.
@*
@* @author
@*  Parthiban V
@*
@* @par List of Functions:
@*  - ihevc_sao_edge_offset_class2_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*/
39@void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
40@                              WORD32 src_strd,
41@                              UWORD8 *pu1_src_left,
42@                              UWORD8 *pu1_src_top,
43@                              UWORD8 *pu1_src_top_left,
44@                              UWORD8 *pu1_src_top_right,
45@                              UWORD8 *pu1_src_bot_left,
46@                              UWORD8 *pu1_avail,
47@                              WORD8 *pi1_sao_offset_u,
48@                              WORD8 *pi1_sao_offset_v,
49@                              WORD32 wd,
50@                              WORD32 ht)
51@**************Variables Vs Registers*****************************************
52@r0 =>  *pu1_src
53@r1 =>  src_strd
54@r2 =>  *pu1_src_left
55@r3 =>  *pu1_src_top
56@r4 =>  *pu1_src_top_left
57@r5 =>  *pu1_avail
58@r6 =>  *pi1_sao_offset_u
59@r9 =>  *pi1_sao_offset_v
60@r7 =>  wd
61@r8=>   ht
62
63.equ    pu1_src_top_left_offset,    328
64.equ    pu1_src_top_right_offset,   332
65.equ    pu1_src_bot_left_offset,    336
66.equ    pu1_avail_offset,           340
67.equ    pi1_sao_u_offset,           344
68.equ    pi1_sao_v_offset,           348
69.equ    wd_offset,                  352
70.equ    ht_offset,                  356
71
72.text
73.syntax unified
74.p2align 2
75
76.extern gi1_table_edge_idx
77.globl ihevc_sao_edge_offset_class2_chroma_a9q
78
79gi1_table_edge_idx_addr_1:
80.long gi1_table_edge_idx - ulbl1 - 8
81
82gi1_table_edge_idx_addr_2:
83.long gi1_table_edge_idx - ulbl2 - 8
84
85gi1_table_edge_idx_addr_3:
86.long gi1_table_edge_idx - ulbl3 - 8
87
88gi1_table_edge_idx_addr_4:
89.long gi1_table_edge_idx - ulbl4 - 8
90
91gi1_table_edge_idx_addr_5:
92.long gi1_table_edge_idx - ulbl5 - 8
93
94ihevc_sao_edge_offset_class2_chroma_a9q:
95
96
97    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
98    vpush       {d8  -  d15}
99    SUB         sp,sp,#224                  @Decrement the stack pointer to store some temp arr values
100
101    LDR         r7,[sp,#wd_offset]          @Loads wd
102    LDR         r8,[sp,#ht_offset]          @Loads ht
103    SUB         r9,r7,#2                    @wd - 2
104
105    LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
106    LDRH        r10,[r3,r9]                 @pu1_src_top[wd - 2]
107
108    STR         r0,[sp,#212]                @Store pu1_src in sp
109    MOV         r9,r7                       @Move width to r9 for loop count
110
111    STR         r2,[sp,#216]                @Store pu1_src_left in sp
112    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
113    LDR         r6,[sp,#pi1_sao_u_offset]   @Loads pi1_sao_offset_u
114
115    STR         r3,[sp,#220]                @Store pu1_src_top in sp
116
117    STRH        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 2]
118    SUB         r10,r8,#1                   @ht-1
119    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
120    ADD         r12,sp,#10                  @temp array
121
122AU1_SRC_TOP_LOOP:
123    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
124    SUBS        r9,r9,#8                    @Decrement the loop count by 8
125    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
126    BNE         AU1_SRC_TOP_LOOP
127
128PU1_AVAIL_4_LOOP_U:
129    LDRB        r9,[r5,#4]                  @pu1_avail[4]
130    CMP         r9,#0
131    LDRB        r9,[r0]                     @u1_pos_0_0_tmp_u = pu1_src[0]
132    LDRB        r10,[r0,#1]                 @u1_pos_0_0_tmp_v = pu1_src[1]
133    BEQ         PU1_AVAIL_7_LOOP_U
134
135    LDRB        r11,[r4]                    @pu1_src_top_left[0]
136    ADD         r14,r0,r1                   @pu1_src + src_strd
137
138    SUB         r12,r9,r11                  @pu1_src[0] - pu1_src_top_left[0]
139
140    LDRB        r14,[r14,#2]                @pu1_src[2 + src_strd]
141    CMP         r12,#0
142
143    MVNLT       r12,#0
144    SUB         r11,r9,r14                  @pu1_src[0] - pu1_src[2 + src_strd]
145
146    MOVGT       r12,#1                      @SIGN(pu1_src[0] - pu1_src_top_left[0])
147
148    CMP         r11,#0
149    MVNLT       r11,#0
150    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
151ulbl1:
152    add         r14,r14,pc
153    MOVGT       r11,#1                      @SIGN(pu1_src[0] - pu1_src[2 + src_strd])
154
155    ADD         r11,r12,r11                 @SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[2 + src_strd])
156    ADD         r11,r11,#2                  @edge_idx
157
158    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
159    CMP         r12,#0                      @0 != edge_idx
160    BEQ         PU1_AVAIL_4_LOOP_V
161    LDRSB       r11,[r6,r12]                @pi1_sao_offset_u[edge_idx]
162    ADD         r9,r9,r11                   @pu1_src[0] + pi1_sao_offset_u[edge_idx]
163    USAT        r9,#8,r9                    @u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
164
165PU1_AVAIL_4_LOOP_V:
166
167    LDRB        r11,[r4,#1]                 @pu1_src_top_left[1]
168    ADD         r14,r0,r1                   @pu1_src + src_strd
169
170    SUB         r12,r10,r11                 @pu1_src[1] - pu1_src_top_left[1]
171    LDRB        r14,[r14,#3]                @pu1_src[3 + src_strd]
172
173    CMP         r12,#0
174    MVNLT       r12,#0
175    SUB         r11,r10,r14                 @pu1_src[1] - pu1_src[3 + src_strd]
176    MOVGT       r12,#1                      @SIGN(pu1_src[0] - pu1_src_top_left[0])
177
178    CMP         r11,#0
179    MVNLT       r11,#0
180    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
181ulbl2:
182    add         r14,r14,pc
183    MOVGT       r11,#1                      @SIGN(pu1_src[0] - pu1_src[3 + src_strd])
184
185    ADD         r11,r12,r11                 @SIGN(pu1_src[0] - pu1_src_top_left[0]) +  SIGN(pu1_src[0] - pu1_src[3 + src_strd])
186    ADD         r11,r11,#2                  @edge_idx
187
188    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
189    CMP         r12,#0                      @0 != edge_idx
190    BEQ         PU1_AVAIL_7_LOOP_U
191    LDR         r11,[sp,#pi1_sao_v_offset]  @Loads pi1_sao_offset_v
192    LDRSB       r11,[r11,r12]               @pi1_sao_offset_v[edge_idx]
193    ADD         r10,r10,r11                 @pu1_src[0] + pi1_sao_offset_v[edge_idx]
194    USAT        r10,#8,r10                  @u1_pos_0_0_tmp_v = CLIP3(pu1_src[0] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1)
195
196PU1_AVAIL_7_LOOP_U:
197    STRB        r10,[sp,#7]
198    STRB        r9,[sp,#6]
199
200    LDRB        r10,[r5,#7]                 @pu1_avail[7]
201    CMP         r10,#0
202    SUB         r10,r7,#2                   @wd - 2
203    SUB         r11,r8,#1                   @ht - 1
204    MLA         r12,r11,r1,r10              @wd - 2 + (ht - 1) * src_strd
205    ADD         r12,r12,r0                  @pu1_src[wd - 2 + (ht - 1) * src_strd]
206    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd]
207    LDRB        r9,[r12,#1]                 @u1_pos_wd_ht_tmp_v = pu1_src[wd - 2 + (ht - 1) * src_strd]
208    BEQ         PU1_AVAIL_3_LOOP
209
210    SUB         r11,r12,r1                  @pu1_src[(wd - 2 + (ht - 1) * src_strd) - src_strd]
211    SUB         r11,r11,#2                  @pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
212    LDRB        r11,[r11]                   @Load pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]
213    SUB         r11,r10,r11                 @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd]
214    CMP         r11,#0
215    MVNLT       r11,#0
216    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd- 2 - src_strd])
217
218    ADD         r14,r12,r1                  @pu1_src[(wd - 2 + (ht - 1) * src_strd) + src_strd]
219    ADD         r14,r14,#2                  @pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
220    LDRB        r14,[r14]                   @Load pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]
221    SUB         r14,r10,r14                 @pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
222    CMP         r14,#0
223    MVNLT       r14,#0
224    MOVGT       r14,#1                      @SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd])
225
226    ADD         r11,r11,r14                 @Add 2 sign value
227    ADD         r11,r11,#2                  @edge_idx
228    LDR         r14, gi1_table_edge_idx_addr_3 @table pointer
229ulbl3:
230    add         r14,r14,pc
231
232    LDRSB       r14,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
233    CMP         r14,#0
234    BEQ         PU1_AVAIL_7_LOOP_V
235    LDRSB       r11,[r6,r14]                @pi1_sao_offset_u[edge_idx]
236    ADD         r10,r10,r11                 @pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
237    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
238
239PU1_AVAIL_7_LOOP_V:
240    ADD         r12,r12,#1
241    SUB         r11,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) - src_strd]
242    SUB         r11,r11,#2                  @pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
243    LDRB        r11,[r11]                   @Load pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]
244    SUB         r11,r9,r11                  @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd- 2 - src_strd]
245    CMP         r11,#0
246    MVNLT       r11,#0
247    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd])
248
249    ADD         r14,r12,r1                  @pu1_src[(wd - 1 + (ht - 1) * src_strd) + src_strd]
250    ADD         r14,r14,#2                  @pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
251    LDRB        r14,[r14]                   @Load pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
252    SUB         r14,r9,r14                  @pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]
253    CMP         r14,#0
254    MVNLT       r14,#0
255    MOVGT       r14,#1                      @SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd])
256
257    ADD         r11,r11,r14                 @Add 2 sign value
258    ADD         r11,r11,#2                  @edge_idx
259    LDR         r14, gi1_table_edge_idx_addr_4 @table pointer
260ulbl4:
261    add         r14,r14,pc
262
263    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
264    CMP         r12,#0
265    BEQ         PU1_AVAIL_3_LOOP
266    LDR         r14,[sp,#pi1_sao_v_offset]  @Loads pi1_sao_offset_v
267    LDRSB       r11,[r14,r12]               @pi1_sao_offset_v[edge_idx]
268    ADD         r9,r9,r11                   @pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
269    USAT        r9,#8,r9                    @u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
270
271PU1_AVAIL_3_LOOP:
272    STRB        r10,[sp,#8]
273    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
274    STRB        r9,[sp,#9]
275
276    MOV         r12,r8                      @Move ht
277    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
278    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
279
280    LDRB        r11,[r5,#3]                 @pu1_avail[3]
281    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
282    CMP         r11,#0
283
284    SUBEQ       r12,r12,#1                  @ht_tmp--
285    LDRB        r5,[r5,#2]                  @pu1_avail[2]
286
287    CMP         r5,#0
288
289    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
290    VLD1.8      D6,[r6]                     @offset_tbl_u = vld1_s8(pi1_sao_offset_u)
291    SUBEQ       r12,r12,#1                  @ht_tmp--
292
293    LDR         r6,[sp,#pi1_sao_v_offset]   @Loads pi1_sao_offset_v
294    ADDEQ       r14,r14,#2                  @pu1_src_left_cpy += 2
295
296    STR         r0,[sp,#2]                  @Store pu1_src in sp
297    VLD1.8      D7,[r6]                     @offset_tbl_v = vld1_s8(pi1_sao_offset_v)
298    LDR         r2, gi1_table_edge_idx_addr_5 @table pointer
299ulbl5:
300    add         r2,r2,pc
301
302    MOV         r6,r7                       @move wd to r6 loop_count
303    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
304    CMP         r7,#16                      @Compare wd with 16
305
306    BLT         WIDTH_RESIDUE               @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
307    CMP         r8,#4                       @Compare ht with 4
308    BLE         WD_16_HT_4_LOOP             @If jump to WD_16_HT_4_LOOP
309
310WIDTH_LOOP_16:
311    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
312    LDR         r7,[sp,#wd_offset]          @Loads wd
313    CMP         r6,r7                       @col == wd
314    LDRBEQ      r8,[r5]                     @pu1_avail[0]
315
316    MOVNE       r8,#-1
317    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
318
319    CMP         r6,#16                      @if(col == 16)
320    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
321
322    BNE         SKIP_AU1_MASK_VAL
323    LDRB        r8,[r5,#1]                  @pu1_avail[1]
324    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
325    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
326
327SKIP_AU1_MASK_VAL:
328    LDRB        r9,[r5,#2]                  @pu1_avail[2]
329    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
330    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
331    SUB         r0,#8
332    CMP         r9,#0
333
334    LDR         r4,[sp,#ht_offset]          @Loads ht
335    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
336
337    LDR         r7,[sp,#wd_offset]          @Loads wd
338    MOVNE       r8,r3                       @pu1_src_top_cpy
339
340    SUB         r8,r8,#2                    @pu1_src - src_strd - 2
341    ADD         r3,r3,#16
342
343    ADD         r5,sp,#75                   @*au1_src_left_tmp
344    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
345    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
346    SUB         r8,#8
347    SUB         r7,r7,r6                    @(wd - col)
348
349    ADD         r7,r7,#14                   @15 + (wd - col)
350    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
351    LDR         r8,[sp,#212]                @Loads *pu1_src
352
353    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
354    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
355
356AU1_SRC_LEFT_LOOP:
357    LDRH        r8,[r7]                     @load the value and increment by src_strd
358    SUBS        r4,r4,#1                    @decrement the loop count
359
360    STRH        r8,[r5],#2                  @store it in the stack pointer
361    ADD         r7,r7,r1
362
363    BNE         AU1_SRC_LEFT_LOOP
364
365    ADD         r8,r0,r1                    @I *pu1_src + src_strd
366    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
367    MOV         r7,r12                      @row count, move ht_tmp to r7
368
369    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
370    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
371    SUB         r8,#8
372
373    ADD         r8,r8,#16                   @I
374    VMOV.I8     Q9,#0
375    LDRH        r5,[r8]                     @I pu1_src_cpy[src_strd + 16]
376
377    LDR         r10,[sp,#pu1_avail_offset]  @I Loads pu1_avail
378    VMOV.16     D18[0],r5                   @I pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
379    LDRB        r10,[r10,#2]                @I pu1_avail[2]
380
381    CMP         r10,#0                      @I
382    VEXT.8      Q9,Q8,Q9,#2                 @I pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
383    BNE         SIGN_UP_CHANGE_DONE         @I
384
385    LDRB        r11,[r0]                    @I pu1_src_cpy[0]
386    SUB         r4,r12,r7                   @I ht_tmp - row
387
388    LDRB        r10,[r0,#1]                 @I pu1_src_cpy[0]
389    LSL         r4,r4,#1                    @I (ht_tmp - row) * 2
390
391    ADD         r9,r14,r4                   @I pu1_src_left_cpy[(ht_tmp - row) * 2]
392    LDRB        r5,[r9,#-2]                 @I load the value
393
394    SUB         r8,r11,r5                   @I pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
395    LDRB        r5,[r9,#-1]                 @I load the value
396
397    CMP         r8,#0                       @I
398    SUB         r4,r10,r5                   @I pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
399
400    MVNLT       r8,#0                       @I
401    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
402
403    CMP         r4,#0                       @I
404    VMOV.8      D14[0],r8                   @I sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
405    MVNLT       r4,#0                       @I
406
407    MOVGT       r4,#1                       @I SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
408    VMOV.8      D14[1],r4                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
409
410SIGN_UP_CHANGE_DONE:
411    VLD1.8      D30,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
412    VCGT.U8     Q10,Q6,Q9                   @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
413
414    VCLT.U8     Q11,Q6,Q9                   @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
415    VSUB.U8     Q11,Q11,Q10                 @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
416
417    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
418    VADD.I8     Q9,Q9,Q11                   @I edge_idx = vaddq_s8(edge_idx, sign_down)
419
420    VTBL.8      D18,{D30},D18               @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
421    VNEG.S8     Q7,Q11                      @I sign_up = vnegq_s8(sign_down)
422
423    VTBL.8      D19,{D30},D19               @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
424    VEXT.8      Q7,Q7,Q7,#14                @I sign_up = vextq_s8(sign_up, sign_up, 14)
425
426    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
427    VAND        Q11,Q9,Q4                   @I edge_idx = vandq_s8(edge_idx, au1_mask)
428
429    VMOVL.U8    Q9,D13                      @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
430    VUZP.8      D22,D23                     @I
431
432    VTBL.8      D22,{D6},D22                @I
433    VTBL.8      D23,{D7},D23                @I
434    VZIP.8      D22,D23                     @I
435
436    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
437    VADDW.S8    Q10,Q10,D22                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
438
439    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
440    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
441
442    VADDW.S8    Q9,Q9,D23                   @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
443    VMAX.S16    Q9,Q9,Q1                    @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
444
445    VMIN.U16    Q9,Q9,Q2                    @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
446    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
447
448
449PU1_SRC_LOOP:
450    ADD         r8,r0,r1,LSL #1             @II *pu1_src + src_strd
451    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
452    ADD         r11,r8,r1                   @III *pu1_src + src_strd
453
454    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
455    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
456    SUB         r8,#8
457    VLD1.8      D30,[r11]!                  @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
458    VLD1.8      D31,[r11]                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
459    SUB         r11,#8
460
461    ADD         r8,r8,#16                   @II
462    VMOVN.I16   D21,Q9                      @I vmovn_s16(pi2_tmp_cur_row.val[1])
463    LDRH        r5,[r8]                     @II pu1_src_cpy[src_strd + 16]
464
465    ADD         r11,r11,#16                 @III
466    VMOV.16     D28[0],r5                   @II pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
467    LDRH        r4,[r11]                    @III pu1_src_cpy[src_strd + 16]
468
469    LDRB        r8,[r0,r1]                  @II pu1_src_cpy[0]
470    VEXT.8      Q14,Q8,Q14,#2               @II pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
471    SUB         r5,r12,r7                   @II ht_tmp - row
472
473    LSL         r5,r5,#1                    @II (ht_tmp - row) * 2
474    VMOV.16     D18[0],r4                   @III pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
475    ADD         r9,r14,r5                   @II pu1_src_left_cpy[(ht_tmp - row) * 2]
476
477    LDRB        r11,[r9,#-2]                @II load the value
478    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
479    SUB         r8,r8,r11                   @II pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
480
481    CMP         r8,#0                       @II
482    VEXT.8      Q9,Q15,Q9,#2                @III pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
483    LDRB        r11,[r0,#1]                 @II pu1_src_cpy[0]
484
485    MVNLT       r8,#0                       @II
486    VCGT.U8     Q11,Q6,Q14                  @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
487    MOVGT       r8,#1                       @II SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
488
489    LDRB        r5,[r9,#-1]                 @II load the value
490    VMOV.8      D14[0],r8                   @II sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
491    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
492
493    SUB         r11,r11,r5                  @II pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
494    VCLT.U8     Q12,Q6,Q14                  @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
495    CMP         r11,#0                      @II
496
497    MVNLT       r11,#0                      @II
498    VSUB.U8     Q12,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
499    MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
500
501    LDRB        r4,[r0,r1]                  @III pu1_src_cpy[0]
502    VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
503    SUB         r5,r12,r7                   @III ht_tmp - row
504
505    ADD         r10,r0,r1
506    VMOV.8      D14[1],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
507    LSL         r5,r5,#1                    @III (ht_tmp - row) * 2
508
509    ADD         r9,r14,r5                   @III pu1_src_left_cpy[(ht_tmp - row) * 2]
510    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
511    LDRB        r10,[r10,#1]                @III pu1_src_cpy[0]
512
513    LDRB        r5,[r9,#-2]                 @III load the value
514    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
515    SUB         r4,r4,r5                    @III pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
516
517    CMP         r4,#0                       @III
518    LDRB        r9,[r9,#-1]                 @III load the value
519    VTBL.8      D26,{D22},D26               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
520    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
521
522    MVNLT       r4,#0                       @III
523    SUB         r10,r10,r9                  @III pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
524    VTBL.8      D27,{D22},D27               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
525    VEXT.8      Q7,Q7,Q7,#14                @II sign_up = vextq_s8(sign_up, sign_up, 14)
526
527    MOVGT       r4,#1                       @III SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
528    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
529    CMP         r10,#0                      @III
530
531    VUZP.8      D26,D27                     @II
532    VMOV.8      d14[0],r4                   @III sign_up = sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
533
534    MVNLT       r10,#0                      @III
535    MOVGT       r10,#1                      @III SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
536    VTBL.8      D24,{D6},D26                @II
537    VCGT.U8     Q10,Q8,Q9                   @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
538
539    VCLT.U8     Q11,Q8,Q9                   @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
540    VTBL.8      D25,{D7},D27                @II
541    VSUB.U8     Q11,Q11,Q10                 @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
542
543    VMOV.8      D14[1],r10                  @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
544    VZIP.8      D24,D25                     @II
545
546    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
547    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
548
549    VLD1.8      D20,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
550    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
551
552    VADD.I8     Q9,Q9,Q11                   @III edge_idx = vaddq_s8(edge_idx, sign_down)
553    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
554
555    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
556    VTBL.8      D18,{D20},D18               @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
557    VNEG.S8     Q7,Q11                      @III sign_up = vnegq_s8(sign_down)
558
559    VTBL.8      D19,{D20},D19               @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
    VEXT.8      Q7,Q7,Q7,#14                @III sign_up = vextq_s8(sign_up, sign_up, 14)

    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)

    VUZP.8      D18,D19                     @III de-interleave edge_idx into U lanes (D18) and V lanes (D19) for per-plane lookup
    VTBL.8      D22,{D6},D18                @III U-plane offset = vtbl1_s8(pi1_sao_offset_u, edge_idx)
    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    VMOV        Q6,Q15                      @III pu1_cur_row = pu1_next_row
    VTBL.8      D23,{D7},D19                @III V-plane offset = vtbl1_s8(pi1_sao_offset_v, edge_idx)
    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)

    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VZIP.8      D22,D23                     @III re-interleave the U/V offsets back into pixel order
    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])

    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
    VADDW.S8    Q10,Q10,D22                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)

    VMOVL.U8    Q9,D17                      @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)

    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
    VADDW.S8    Q9,Q9,D23                   @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)

    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
    VMAX.S16    Q9,Q9,Q1                    @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    CMP         r7,#1                       @>1: more rows to pipeline; ==1: fall through and finish the last row; <1: done

    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
    VMIN.U16    Q9,Q9,Q2                    @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    BGT         PU1_SRC_LOOP                @If not equal jump to PU1_SRC_LOOP
    BLT         INNER_LOOP_DONE             @no trailing row left to process

    @ --- last-row (non-pipelined) processing: row r0+src_strd, using III results still in flight ---
    ADD         r8,r0,r1,LSL #1             @*pu1_src + src_strd
    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])

    LDRB        r11,[r0,r1]                 @pu1_src_cpy[0] (of the last row; the III store below has not advanced r0 yet)
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8
    SUB         r4,r12,r7                   @ht_tmp - row

    ADD         r8,r8,#16
    VMOVN.I16   D21,Q9                      @III vmovn_s16(pi2_tmp_cur_row.val[1])
    LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]

    LSL         r4,r4,#1                    @(ht_tmp - row) * 2 (two bytes per chroma pixel pair)
    VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    ADD         r9,r14,r4                   @pu1_src_left_cpy[(ht_tmp - row) * 2]

    LDRB        r5,[r9,#-2]                 @load the value
    VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)
    SUB         r8,r11,r5                   @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]

    CMP         r8,#0
    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row); r0 now points at the last row
    MVNLT       r8,#0                       @r8 = -1 when the difference is negative

    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VLD1.8      D30,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)

    LDRB        r11,[r0,#1]                 @pu1_src_cpy[1]
    VMOV.8      D14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)
    LDRB        r5,[r9,#-1]                 @load the value

    SUB         r4,r11,r5                   @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    CMP         r4,#0

    MVNLT       r4,#0                       @r4 = -1 when the difference is negative
    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    MOVGT       r4,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])

    VMOV.8      D14[1],r4                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)
    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VTBL.8      D26,{D30},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D30},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VMOVL.U8    Q9,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VUZP.8      D26,D27                     @de-interleave edge_idx into U (D26) and V (D27) lanes

    VTBL.8      D24,{D6},D26                @U-plane offset lookup
    VTBL.8      D25,{D7},D27                @V-plane offset lookup
    VZIP.8      D24,D25                     @re-interleave U/V offsets back into pixel order

    VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VADDW.S8    Q9,Q9,D25                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q9,Q9,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
664
665
INNER_LOOP_DONE:
    @ All rows of this 16-wide column are done: flush the final narrowed row,
    @ copy the saved left-column bytes out, then advance to the next column.
    LDR         r8,[sp,#ht_offset]          @Loads ht
    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    ADD         r5,sp,#75                   @*au1_src_left_tmp

    LDR         r11,[sp,#216]               @Loads *pu1_src_left
    VMOVN.I16   D21,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[1])


SRC_LEFT_LOOP:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#2                    @4 bytes copied = 2 rows of U/V pairs
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]
    BNE         SRC_LEFT_LOOP

    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row) - store the last processed row
    CMP         r6,#8                       @Check whether residue remains

    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r0,[sp,#2]                  @Loads *pu1_src  @NOTE(review): offset #2 disagrees with the #212 used elsewhere for *pu1_src - verify
    SUB         r7,r7,r6                    @(wd - remaining cols) = bytes already processed
    ADD         r0,r0,r7                    @step pu1_src to the start of the next 16-wide column
    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
692
693
WD_16_HT_4_LOOP:
    @ 16-wide column path for small heights (ht <= 4): same class2 diagonal SAO
    @ as the main loop but without software pipelining.
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDR         r7,[sp,#wd_offset]          @Loads wd
    CMP         r6,r7                       @col == wd (first/leftmost column?)
    LDRBEQ      r8,[r5]                     @pu1_avail[0]

    MOVNE       r8,#-1                      @interior column: mask fully enabled
    VMOV.8      D8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    VMOV.8      D8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1) - U and V of the left edge

    CMP         r6,#16                      @if(col == 16) - last/rightmost column
    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      D9[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 14)
    VMOV.8      D9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15) - U and V of the right edge

SKIP_AU1_MASK_VAL_WD_16_HT_4:
    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3                       @pu1_src_top_cpy
    SUB         r8,r8,#2                    @pu1_src - src_strd - 2 (one chroma pair to the left: class2 diagonal)
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2) || vld1q_u8(pu1_src_top_cpy - 2)
    SUB         r8,#8

    ADD         r3,r3,#16                   @advance pu1_src_top_cpy to the next column
    ADD         r5,sp,#75                   @*au1_src_left_tmp
    LDR         r4,[sp,#ht_offset]          @Loads ht
    LDR         r7,[sp,#wd_offset]          @Loads wd
    SUB         r7,r7,r6                    @(wd - col)
    ADD         r7,r7,#14                   @14 + (wd - col): last U/V pair of this column
    LDR         r8,[sp,#212]                @Loads *pu1_src
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 14 + (wd - col)]

AU1_SRC_LEFT_LOOP_WD_16_HT_4:
    @ Snapshot this column's rightmost U/V pair per row before it is overwritten;
    @ it becomes pu1_src_left for the next column.
    LDRH        r8,[r7]                     @load the value and increment by src_strd
    STRH        r8,[r5],#2                  @store it in the stack pointer
    ADD         r7,r7,r1

    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4

    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    VMOV.I8     Q9,#0
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_WD_16_HT_4:
    VMOV.I8     Q9,#0
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8

    ADD         r8,r8,#16
    LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
    VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)

    CMP         r7,r12                      @first row of the column?
    BLT         SIGN_UP_CHANGE_WD_16_HT_4
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4

SIGN_UP_CHANGE_WD_16_HT_4:
    @ Recompute sign_up lanes 0/1 against the saved left-neighbour column.
    LDRB        r8,[r0]                     @pu1_src_cpy[0]
    SUB         r5,r12,r7                   @ht_tmp - row
    LSL         r5,r5,#1                    @(ht_tmp - row) * 2
    ADD         r9,r14,r5                   @pu1_src_left_cpy[(ht_tmp - row) * 2]
    LDRB        r5,[r9,#-2]                 @load the value
    SUB         r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    CMP         r8,#0
    MVNLT       r8,#0                       @r8 = -1 when the difference is negative
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)

    LDRB        r8,[r0,#1]                  @pu1_src_cpy[1]
    LDRB        r5,[r9,#-1]                 @load the value
    SUB         r8,r8,r5                    @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    CMP         r8,#0
    MVNLT       r8,#0                       @r8 = -1 when the difference is negative
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VMOV.8      d14[1],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)

SIGN_UP_CHANGE_DONE_WD_16_HT_4:
    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VTBL.8      D26,{D22},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D22},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#14                @sign_up = vextq_s8(sign_up, sign_up, 14)

    VUZP.8      D26,D27                     @de-interleave edge_idx into U (D26) and V (D27) lanes
    VTBL.8      D24,{D6},D26                @U-plane offset lookup
    VTBL.8      D25,{D7},D27                @V-plane offset lookup
    VZIP.8      D24,D25                     @re-interleave U/V offsets back into pixel order

    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVL.U8    Q13,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
    VADDW.S8    Q13,Q13,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
    VMOVN.I16   D29,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[1])

    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4

    LDR         r8,[sp,#ht_offset]          @Loads ht
    ADD         r5,sp,#75                   @*au1_src_left_tmp
    LDR         r11,[sp,#216]               @Loads *pu1_src_left

SRC_LEFT_LOOP_WD_16_HT_4:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]

    SUBS        r8,r8,#2                    @4 bytes copied = 2 rows of U/V pairs
    BNE         SRC_LEFT_LOOP_WD_16_HT_4


    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
    CMP         r6,#8                       @Check whether residue remains
    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r0,[sp,#2]                  @Loads *pu1_src  @NOTE(review): offset #2 disagrees with the #212 used elsewhere for *pu1_src - verify
    SUB         r7,r7,r6                    @bytes already processed
    ADD         r0,r0,r7                    @step pu1_src to the next 16-wide column
    BGT         WD_16_HT_4_LOOP
    BEQ         WIDTH_RESIDUE
848    BEQ         WIDTH_RESIDUE
849
850
WIDTH_RESIDUE:
    @ Residue path: the final 8-byte (4 U/V pairs) column when wd is not a
    @ multiple of 16. Only the low half (D registers) carries valid pixels.
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    CMP         r6,r7                       @wd_residue == wd (whole picture narrower than 16?)
    LDRBEQ      r8,[r5]                     @pu1_avail[0]

    MOVNE       r8,#-1                      @interior column: mask fully enabled
    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
    VMOV.8      d8[1],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 1) - U and V of the left edge

    LDRB        r8,[r5,#1]                  @pu1_avail[1]
    VMOV.8      d8[6],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 6)
    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7) - U and V of the right edge

    LDRB        r8,[r5,#2]                  @pu1_avail[2]
    CMP         r8,#0

    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
    MOVNE       r8,r3                       @pu1_src_top_cpy
    SUB         r8,r8,#2                    @pu1_src - src_strd - 2 (one chroma pair to the left: class2 diagonal)
    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd - 2)
    SUB         r8,#8

    ADD         r5,sp,#75                   @*au1_src_left_tmp
    LDR         r4,[sp,#ht_offset]          @Loads ht
    LDR         r7,[sp,#wd_offset]          @Loads wd
    LDR         r8,[sp,#212]                @Loads *pu1_src
    SUB         r7,r7,#2                    @(wd - 2)
    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + (wd - 2)]

AU1_SRC_LEFT_LOOP_RESIDUE:
    @ Snapshot the rightmost U/V pair per row; it becomes pu1_src_left afterwards.
    LDRH        r8,[r7]                     @load the value and increment by src_strd
    STRH        r8,[r5],#2                  @store it in the stack pointer
    ADD         r7,r7,r1
    SUBS        r4,r4,#1                    @decrement the loop count
    BNE         AU1_SRC_LEFT_LOOP_RESIDUE

    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
    SUB         r0,#8

    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
    MOV         r7,r12                      @row count, move ht_tmp to r7

PU1_SRC_LOOP_RESIDUE:
    VMOV.I8     Q9,#0
    ADD         r8,r0,r1                    @*pu1_src + src_strd
    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
    SUB         r8,#8

    ADD         r8,r8,#16
    LDRH        r5,[r8]                     @pu1_src_cpy[src_strd + 16]
    VMOV.16     D18[0],r5                   @pu1_next_row_tmp = vsetq_lane_u8(pu1_src_cpy[src_strd + 16], pu1_next_row_tmp, 0)
    VEXT.8      Q9,Q8,Q9,#2                 @pu1_next_row_tmp = vextq_u8(pu1_next_row, pu1_next_row_tmp, 2)

    CMP         r7,r12                      @first row of the column?
    BLT         SIGN_UP_CHANGE_RESIDUE
    LDR         r5,[sp,#pu1_avail_offset]   @Loads pu1_avail
    LDRB        r5,[r5,#2]                  @pu1_avail[2]
    CMP         r5,#0
    BNE         SIGN_UP_CHANGE_DONE_RESIDUE

SIGN_UP_CHANGE_RESIDUE:
    @ Recompute sign_up lanes 0/1 against the saved left-neighbour column.
    LDRB        r8,[r0]                     @pu1_src_cpy[0]
    SUB         r5,r12,r7                   @ht_tmp - row
    LSL         r5,r5,#1                    @(ht_tmp - row) * 2
    ADD         r9,r14,r5                   @pu1_src_left_cpy[(ht_tmp - row) * 2]
    LDRB        r5,[r9,#-2]                 @load the value
    SUB         r8,r8,r5                    @pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]
    CMP         r8,#0
    MVNLT       r8,#0                       @r8 = -1 when the difference is negative
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VMOV.8      d14[0],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[0] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2]), sign_up, 0)

    LDRB        r8,[r0,#1]                  @pu1_src_cpy[1]
    LDRB        r5,[r9,#-1]                 @load the value
    SUB         r8,r8,r5                    @pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]
    CMP         r8,#0
    MVNLT       r8,#0                       @r8 = -1 when the difference is negative
    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2])
    VMOV.8      d14[1],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[1] - pu1_src_left_cpy[(ht_tmp - 1 - row) * 2 + 1]), sign_up, 1)

SIGN_UP_CHANGE_DONE_RESIDUE:
    VCGT.U8     Q11,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
    VCLT.U8     Q12,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
    VSUB.U8     Q12,Q12,Q11                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))

    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)

    VLD1.8      D22,[r2]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
    VTBL.8      D26,{D22},D26               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
    VTBL.8      D27,{D22},D27               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))

    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)

    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
    VEXT.8      Q7,Q7,Q7,#14                @sign_up = vextq_s8(sign_up, sign_up, 14)

    VUZP.8      D26,D27                     @de-interleave edge_idx into U (D26) and V (D27) lanes
    VTBL.8      D24,{D6},D26                @U-plane offset lookup
    VTBL.8      D25,{D7},D27                @V-plane offset lookup
    VZIP.8      D24,D25                     @re-interleave U/V offsets back into pixel order

    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))

    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])

    VST1.8      {D28},[r0],r1               @only 8 valid residue bytes: store the low half

    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
    BNE         PU1_SRC_LOOP_RESIDUE        @If not equal jump to PU1_SRC_LOOP

    LDR         r8,[sp,#ht_offset]          @Loads ht
    LDR         r11,[sp,#216]               @Loads *pu1_src_left
    ADD         r5,sp,#75                   @*au1_src_left_tmp

SRC_LEFT_LOOP_RESIDUE:
    LDR         r7,[r5],#4                  @au1_src_left_tmp[row]
    SUBS        r8,r8,#2                    @4 bytes copied = 2 rows of U/V pairs
    STR         r7,[r11],#4                 @pu1_src_left[row] = au1_src_left_tmp[row]

    BNE         SRC_LEFT_LOOP_RESIDUE
982
983
RE_ASSINING_LOOP:
    @ Write back the corner pixels saved before filtering, restore
    @ *pu1_src_top_left, copy the saved top row out, and return.
    LDR         r8,[sp,#ht_offset]          @Loads ht

    LDR         r0,[sp,#212]                @Loads *pu1_src
    SUB         r8,r8,#1                    @ht - 1

    LDR         r7,[sp,#wd_offset]          @Loads wd

    LDRH        r9,[sp,#6]                  @u1_pos_0_0_tmp: top-left U/V pair saved before filtering
    MLA         r6,r8,r1,r7                 @wd + (ht - 1) * src_strd

    STRH        r9,[r0]                     @pu1_src_org[0] = u1_pos_0_0_tmp
    ADD         r6,r0,r6                    @pu1_src + wd + (ht - 1) * src_strd (one pair past the bottom-right)

    LDRH        r9,[sp,#8]                  @u1_pos_wd_ht_tmp: bottom-right U/V pair saved before filtering
    ADD         r12,sp,#10                  @au1_src_top_tmp base on the stack
    STRH        r9,[r6,#-2]                 @pu1_src_org[wd - 2 + (ht - 1) * src_strd] = u1_pos_wd_ht_tmp_u

    LDR         r4,[sp,#pu1_src_top_left_offset]               @Loads pu1_src_top_left
    LDRH        r10,[sp]                    @load u1_src_top_left_tmp from stack pointer
    STRH        r10,[r4]                    @*pu1_src_top_left = u1_src_top_left_tmp
    LDR         r3,[sp,#220]                @Loads pu1_src_top

SRC_TOP_LOOP:
    VLD1.8      D0,[r12]!                   @load au1_src_top_tmp[col], 8 bytes at a time
    SUBS        r7,r7,#8                    @Decrement the width
    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
    BNE         SRC_TOP_LOOP

END_LOOPS:
    ADD         sp,sp,#224                  @release the stack scratch area

    vpop        {d8  -  d15}                @restore callee-saved NEON registers (AAPCS d8-d15)
    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP and return (r15/PC is popped)
1018
1019
1020
1021