1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19
20///**
21//******************************************************************************
22//* //file
23//*  ihevc_inter_pred_luma_horz_w16out.s
24//*
25//* //brief
26//*  contains function definitions for inter prediction  interpolation.
27//* functions are coded using neon  intrinsics and can be compiled using
28
29//* rvct
30//*
31//* //author
32//*  parthiban v
33//*
34//* //par list of functions:
35//*
36//*  - ihevc_inter_pred_luma_horz_w16out()
37//*
38//* //remarks
39//*  none
40//*
41//*******************************************************************************
42//*/
43///**
44//*******************************************************************************
45//*
46//* //brief
47//*   interprediction luma filter for horizontal 16bit output
48//*
49//* //par description:
//*     applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
//*     to the elements pointed to by 'pu1_src' and writes to the location
//*     pointed to by 'pi2_dst'. no downshifting or clipping is done and the
//*     output is used as an input for vertical filtering or weighted prediction.
//*     assumptions : the function is optimized considering the fact that width
//*     is a multiple of 4 or 8. if width is a multiple of 4 then height should
//*     be a multiple of 2. width 8 is optimized further.
57//*
58//* //param[in] pu1_src
59//*  uword8 pointer to the source
60//*
61//* //param[out] pi2_dst
62//*  word16 pointer to the destination
63//*
64//* //param[in] src_strd
65//*  integer source stride
66//*
67//* //param[in] dst_strd
68//*  integer destination stride
69//*
70//* //param[in] pi1_coeff
71//*  word8 pointer to the filter coefficients
72//*
73//* //param[in] ht
74//*  integer height of the array
75//*
76//* //param[in] wd
77//*  integer width of the array
78//*
79//* //returns
80//*
81//* //remarks
82//*  none
83//*
84//*******************************************************************************
85//*/
86
//void ihevc_inter_pred_luma_horz_w16out(uword8 *pu1_src,
//                                word16 *pi2_dst,
//                                word32 src_strd,
//                                word32 dst_strd,
//                                word8 *pi1_coeff,
//                                word32 ht,
//                                word32 wd)
94
95
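//x0 - pu1_src (argument; re-read by the residual-column passes)
//x1 - dst_ptr (first row of the current row pair)
//x2 - src_strd
//x3 - dst_strd
//x4 - pi1_coeff (argument)
//x5 - ht (argument)
//x6 - wd (argument)
//x8 - src pointer for the second row of the pair (src_ptx2)
//x9 - inner loop counter (columns left in the row)
//x10 - dst pointer for the second row of the pair (dst_ptx2)
//x11 - ht residue (ht & 1) / saved dst base for the residual-column passes
//x12 - dst_strd - wd, in 16-bit elements (dst_strd2)
//x13 - 2*src_strd - wd, in bytes (src_strd1)
//x14 - wd handled by the current loop
//x15 - post-increment step for the source loads (#1, or #-7 in the 16-wide loop)
//x16 - src pointer for the first row of the pair (src_ptx1)
//x19 - loop_counter (rows left)
//x20 - scratch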
.text
.align 4

.include "ihevc_neon_macros.s"              //presumably provides push_v_regs / pop_v_regs used below - confirm

.globl ihevc_inter_pred_luma_horz_w16out_av8

.type ihevc_inter_pred_luma_horz_w16out_av8, %function

//------------------------------------------------------------------------------
// ihevc_inter_pred_luma_horz_w16out_av8 : entry and common setup
//
// in  (AAPCS64): x0 = pu1_src, x1 = pi2_dst, w2 = src_strd, w3 = dst_strd,
//                x4 = pi1_coeff, w5 = ht, w6 = wd
//
// on leaving this setup:
//      v24-v31 = |coeff[0]| .. |coeff[7]|, each replicated over 8 byte lanes
//      x14     = wd
//      x16     = pu1_src - 3
//      x13     = 2*src_strd - wd
//      x12     = dst_strd - wd
//      x11     = ht & 1              (ht residue)
//      x19     = ht - (ht & 1)       (rows handled two at a time)
//
// control then goes to odd_height_decision when ht is odd, otherwise it falls
// through into even_height_decision.
//------------------------------------------------------------------------------
ihevc_inter_pred_luma_horz_w16out_av8:

    push_v_regs                             //NEON register save; undone by pop_v_regs on every return path
    stp         x19, x20,[sp,#-16]!         //x19 and x20 are callee-saved and are used below

    //the four word32 arguments arrive in w2, w3, w5 and w6. AAPCS64 leaves
    //bits [63:32] of those registers unspecified, while the rest of the
    //function uses the full x registers for address arithmetic and compares,
    //so widen them once here.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    sxtw        x5,w5                       //ht
    sxtw        x6,w6                       //wd

    ld1         {v0.8b},[x4]                //coeff = vld1_s8(pi1_coeff)
    abs         v2.8b, v0.8b                //vabs_s8(coeff)

    dup         v24.8b, v2.b[0]             //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    dup         v25.8b, v2.b[1]             //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    dup         v26.8b, v2.b[2]             //coeffabs_2 = vdup_lane_u8(coeffabs, 2)
    dup         v27.8b, v2.b[3]             //coeffabs_3 = vdup_lane_u8(coeffabs, 3)
    dup         v28.8b, v2.b[4]             //coeffabs_4 = vdup_lane_u8(coeffabs, 4)
    dup         v29.8b, v2.b[5]             //coeffabs_5 = vdup_lane_u8(coeffabs, 5)
    dup         v30.8b, v2.b[6]             //coeffabs_6 = vdup_lane_u8(coeffabs, 6)
    dup         v31.8b, v2.b[7]             //coeffabs_7 = vdup_lane_u8(coeffabs, 7)

    mov         x14,x6                      //wd
    sub         x16,x0,#3                   //pu1_src - 3
    lsl         x13,x2,#1
    sub         x13,x13,x14                 //2*src_strd - wd
    sub         x12,x3,x14                  //dst_strd - wd

    //note: there is no check for ht <= 0; the row loops below always run
    //their body at least once.
    and         x11,x5,#1                   //ht_residue = (ht & 1)
    sub         x19,x5,x11                  //rows processed in pairs = ht - ht_residue

    cmp         x11,#1
    beq         odd_height_decision
156
//------------------------------------------------------------------------------
// width dispatch
//
// even_height_decision:
//      x11 <- x1 (dst base, read again by the residual-column passes)
//      wd <= 4           -> outer_loop_4
//      wd == 24          -> treat as 16 now: x14 = 16, x12 += 8, x13 += 8
//      x14 >= 16         -> outer_loop_16
//      wd == 12          -> 8-wide loop handles the first 8: x12 += 4, x13 += 4
//      otherwise         -> outer_loop_8
//
// odd_height_decision:
//      wd == 24          -> outer_loop_8 (no stride adjustment)
//      wd == 12          -> outer_loop_4 (no stride adjustment)
//      otherwise         -> same as the even case
//------------------------------------------------------------------------------
even_height_decision:
    mov         x11,x1                      //remember the dst base pointer
    cmp         x14,#4
    b.le        outer_loop_4

    cmp         x14,#24
    b.ne        wd_24_adjust_done
    mov         x14,#16                     //first pass covers 16 of the 24 columns
    add         x12,x12,#8                  //skip the 8 columns left for the second pass (dst)
    add         x13,x13,#8                  //skip the 8 columns left for the second pass (src)
wd_24_adjust_done:

    cmp         x14,#16
    b.ge        outer_loop_16_branch

    cmp         x14,#12
    b.ne        outer_loop_8_branch
    add         x12,x12,#4                  //skip the 4 columns left for the second pass (dst)
    add         x13,x13,#4                  //skip the 4 columns left for the second pass (src)

outer_loop_8_branch:
    b           outer_loop_8

outer_loop_16_branch:
    b           outer_loop_16


odd_height_decision:
    cmp         x14,#24
    b.eq        outer_loop_8_branch         //odd ht, wd 24: all columns go through the 8-wide loop
    cmp         x14,#12
    b.eq        outer_loop_4                //odd ht, wd 12: all columns go through the 4-wide loop
    b           even_height_decision
191
//------------------------------------------------------------------------------
// 4-column path: two rows per outer iteration, 4 output columns per inner
// iteration.
//
// outer_loop4_residual is entered after the 8-wide loop when wd == 12 and
// handles the remaining 4 columns of every row. x0 = pu1_src and x11 = dst
// base are expected to be intact at that point.
//
// per output sample (coefficients are the absolute values held in v24-v31):
//      out = - c0*s0 + c1*s1 - c2*s2 + c3*s3 + c4*s4 - c5*s5 + c6*s6 - c7*s7
// the 16-bit accumulator is stored as is (no rounding, shift or clip).
//------------------------------------------------------------------------------
outer_loop4_residual:
    sub         x16,x0,#3                   //pu1_src - 3
    mov         x1,x11                      //dst base saved before the 8-wide pass
    add         x1, x1,#16                  //skip the 8 columns (16 bytes) already written
    mov         x14,#4                      //4 columns remain per row
    add         x16, x16,#8                 //skip the 8 source columns already consumed
    sxtw        x19,w5                      //row count comes from ht rather than a hard-coded 16; this
                                            //path is only reached with even ht (odd ht with wd 12 never
                                            //enters the 8-wide loop), so ht is the full pair count
    add         x12, x12,#4                 //dst_strd - 4
    add         x13, x13,#4                 //2*src_strd - 4

outer_loop_4:
    add         x10,x1,x3,lsl #1            //pi2_dst + dst_strd (second row, 2 bytes per element)
    add         x8,x16,x2                   //pu1_src + src_strd (second row)

    subs        x9,x14,#0                   //checks wd
    ble         end_inner_loop_4

inner_loop_4:
    mov         x15,#1                      //each load steps the source pointer by one byte
    //each zip1 packs the first 4 bytes of the row i load (low half) with the
    //first 4 bytes of the row ii load (high half), so one umull/umlal/umlsl
    //chain filters both rows at once.
    ld1         {v20.2s},[x16],x15          //row i  : src[0..]
    ld1         {v21.2s},[x16],x15          //row i  : src[1..]
    ld1         {v22.2s},[x8],x15           //row ii : src[0..]
    ld1         {v23.2s},[x8],x15           //row ii : src[1..]

    zip1        v0.2s, v20.2s, v22.2s       //tap 0
    zip1        v1.2s, v21.2s, v23.2s       //tap 1

    ld1         {v20.2s},[x16],x15
    ld1         {v21.2s},[x16],x15
    ld1         {v22.2s},[x8],x15
    ld1         {v23.2s},[x8],x15

    zip1        v2.2s, v20.2s, v22.2s       //tap 2
    zip1        v3.2s, v21.2s, v23.2s       //tap 3

    ld1         {v20.2s},[x16],x15
    ld1         {v21.2s},[x16],x15
    ld1         {v22.2s},[x8],x15
    ld1         {v23.2s},[x8],x15

    zip1        v4.2s, v20.2s, v22.2s       //tap 4
    zip1        v5.2s, v21.2s, v23.2s       //tap 5

    ld1         {v20.2s},[x16],x15
    ld1         {v21.2s},[x16],x15
    ld1         {v22.2s},[x8],x15
    ld1         {v23.2s},[x8],x15

    sub         x16,x16,#4                  //8 single-byte steps were taken; net advance is 4 columns
    sub         x8,x8,#4

    zip1        v6.2s, v20.2s, v22.2s       //tap 6
    zip1        v7.2s, v21.2s, v23.2s       //tap 7

    umull       v8.8h, v1.8b, v25.8b        //+ c1*s1
    umlsl       v8.8h, v0.8b, v24.8b        //- c0*s0
    umlsl       v8.8h, v2.8b, v26.8b        //- c2*s2
    umlal       v8.8h, v3.8b, v27.8b        //+ c3*s3
    umlal       v8.8h, v4.8b, v28.8b        //+ c4*s4
    umlsl       v8.8h, v5.8b, v29.8b        //- c5*s5
    umlal       v8.8h, v6.8b, v30.8b        //+ c6*s6
    umlsl       v8.8h, v7.8b, v31.8b        //- c7*s7

    st1         {v8.d}[0],[x1],#8           //row i result  : low half of the accumulator
    st1         {v8.d}[1],[x10],#8          //row ii result : high half of the accumulator
    subs        x9,x9,#4                    //decrement the wd by 4
    bgt         inner_loop_4

end_inner_loop_4:
    subs        x19,x19,#2                  //two rows done
    add         x16,x16,x13                 //increment the input pointer by 2*src_strd-wd
    add         x1,x10,x12,lsl #1           //row after the second row: x10 + 2*(dst_strd-wd) bytes
    bgt         outer_loop_4                //add does not touch the flags set by subs above
296
297
//------------------------------------------------------------------------------
// height_residue_4: handles the single leftover row when ht is odd, 4 output
// columns per inner iteration. returns immediately when ht is even.
//
// expects: x16 = source pointer (already offset by -3) of the leftover row,
//          x1  = destination pointer of the leftover row,
//          x14 = wd, x12 = dst_strd - wd, v24-v31 = absolute coefficients.
//------------------------------------------------------------------------------
height_residue_4:

    and         x11,x5,#1                   //ht_residue = (ht & 1)
    cbnz        x11,outer_loop_height_residue_4

    ldp         x19, x20,[sp], #16          //even ht: nothing left to do
    pop_v_regs
    ret

outer_loop_height_residue_4:                //x11 is 1 on entry, so this body runs once

    subs        x9,x14,#0                   //checks wd
    ble         end_inner_loop_height_residue_4

inner_loop_height_residue_4:
    mov         x15, #1                     //each load steps the source pointer by one byte
    ld1         {v0.2s},[x16],x15           //src[0..]
    ld1         {v1.2s},[x16],x15           //src[1..]
    ld1         {v2.2s},[x16],x15           //src[2..]
    umull       v8.8h, v1.8b, v25.8b        //+ c1*s1
    ld1         {v3.2s},[x16],x15           //src[3..]
    umlsl       v8.8h, v0.8b, v24.8b        //- c0*s0
    ld1         {v4.2s},[x16],x15           //src[4..]
    umlsl       v8.8h, v2.8b, v26.8b        //- c2*s2
    ld1         {v5.2s},[x16],x15           //src[5..]
    umlal       v8.8h, v3.8b, v27.8b        //+ c3*s3
    ld1         {v6.2s},[x16],x15           //src[6..]
    umlal       v8.8h, v4.8b, v28.8b        //+ c4*s4
    ld1         {v7.2s},[x16],x15           //src[7..]
    umlsl       v8.8h, v5.8b, v29.8b        //- c5*s5
    sub         x16,x16,#4                  //8 single-byte steps were taken; net advance is 4 columns
    umlal       v8.8h, v6.8b, v30.8b        //+ c6*s6
    umlsl       v8.8h, v7.8b, v31.8b        //- c7*s7
    subs        x9,x9,#4                    //decrement the wd by 4
    st1         {v8.d}[0],[x1],#8           //only the low 4 results are valid; store 4 x 16 bit
    bgt         inner_loop_height_residue_4

end_inner_loop_height_residue_4:
    subs        x11,x11,#1                  //one row done
    sub         x13,x2,x14                  //src_strd - wd
    add         x16,x16,x13                 //increment the input pointer by src_strd-wd
    add         x1,x1,x12,lsl #1            //increment the output pointer by (dst_strd-wd) elements;
                                            //scaled to bytes like every other dst update in this file
    bgt         outer_loop_height_residue_4 //flags still come from the subs on x11

    ldp         x19, x20,[sp], #16
    pop_v_regs
    ret
369
//------------------------------------------------------------------------------
// 8-column path: two rows per outer iteration, 8 output columns per inner
// iteration.
//
// outer_loop8_residual is entered after the 16-wide loop when wd == 24 and
// handles the remaining 8 columns of every row. x0 = pu1_src and x11 = dst
// base are expected to be intact at that point.
//
// per output sample (coefficients are the absolute values held in v24-v31):
//      out = - c0*s0 + c1*s1 - c2*s2 + c3*s3 + c4*s4 - c5*s5 + c6*s6 - c7*s7
// the 16-bit accumulator is stored as is (no rounding, shift or clip).
//------------------------------------------------------------------------------
outer_loop8_residual:
    sub         x16,x0,#3                   //pu1_src - 3
    mov         x1,x11                      //dst base saved before the 16-wide pass
    sxtw        x19,w5                      //row count comes from ht rather than a hard-coded 32; this
                                            //path is only reached with even ht (odd ht with wd 24 never
                                            //enters the 16-wide loop), so ht is the full pair count
    add         x1, x1,#32                  //skip the 16 columns (32 bytes) already written
    add         x16, x16,#16                //skip the 16 source columns already consumed
    mov         x14,#8                      //8 columns remain per row
    add         x12, x12,#8                 //dst_strd - 8
    add         x13, x13,#8                 //2*src_strd - 8

outer_loop_8:

    add         x10,x1,x3,lsl #1            //pi2_dst + dst_strd (second row, 2 bytes per element)
    add         x8,x16,x2                   //pu1_src + src_strd (second row)
    subs        x9,x14,#0                   //checks wd

    ble         end_inner_loop_8

inner_loop_8:
    mov         x15, #1                     //each load steps the source pointer by one byte
    ld1         {v0.2s},[x16],x15           //row i : src[0..]
    ld1         {v1.2s},[x16],x15           //row i : src[1..]
    ld1         {v2.2s},[x16],x15           //row i : src[2..]
    ld1         {v3.2s},[x16],x15           //row i : src[3..]
    ld1         {v4.2s},[x16],x15           //row i : src[4..]
    umull       v8.8h, v1.8b, v25.8b        //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    ld1         {v5.2s},[x16],x15           //row i : src[5..]
    umlal       v8.8h, v3.8b, v27.8b        //mul_res = vmlal_u8(src[0_3], coeffabs_3)//
    ld1         {v6.2s},[x16],x15           //row i : src[6..]
    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         {v7.2s},[x16],x15           //row i : src[7..]; x16 has now advanced 8 columns
    umlsl       v8.8h, v2.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    ld1         {v12.2s},[x8],x15           //row ii : src[0..]
    umlal       v8.8h, v4.8b, v28.8b        //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    ld1         {v13.2s},[x8],x15           //row ii : src[1..]
    umlsl       v8.8h, v5.8b, v29.8b        //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    ld1         {v14.2s},[x8],x15           //row ii : src[2..]
    umlal       v8.8h, v6.8b, v30.8b        //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    ld1         {v15.2s},[x8],x15           //row ii : src[3..]
    umlsl       v8.8h, v7.8b, v31.8b        //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
    ld1         {v16.2s},[x8],x15           //row ii : src[4..]

    umull       v10.8h, v15.8b, v27.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         {v17.2s},[x8],x15           //row ii : src[5..]
    umlsl       v10.8h, v14.8b, v26.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    ld1         {v18.2s},[x8],x15           //row ii : src[6..]
    umlal       v10.8h, v16.8b, v28.8b      //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    ld1         {v19.2s},[x8],x15           //row ii : src[7..]
    umlsl       v10.8h, v17.8b, v29.8b      //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    umlal       v10.8h, v18.8b, v30.8b      //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    umlsl       v10.8h, v19.8b, v31.8b      //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
    st1         {v8.8h},[x1],#16            //store 8 row i results
    umlsl       v10.8h, v12.8b, v24.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlal       v10.8h, v13.8b, v25.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    subs        x9,x9,#8                    //decrement the wd loop
    st1         {v10.8h},[x10],#16          //store 8 row ii results
    cmp         x9,#4                       //a remainder of 4 (wd 12) is left to outer_loop4_residual
    bgt         inner_loop_8

end_inner_loop_8:
    subs        x19,x19,#2                  //decrement the ht loop
    add         x16,x16,x13                 //increment the src pointer by 2*src_strd-wd
    add         x1,x10,x12,lsl #1           //increment the dst pointer by 2*dst_strd-wd
    bgt         outer_loop_8                //add does not touch the flags set by subs above

    sxtw        x14,w6                      //reload the original wd (word32 argument)
    cmp         x14,#12

    beq         outer_loop4_residual        //wd 12: 4 columns of every row are still pending

    and         x11,x5,#1                   //ht_residue = (ht & 1)
    cmp         x11,#1
    beq         height_residue_4            //odd ht: one row is still pending

//end_loops

    ldp         x19, x20,[sp], #16
    pop_v_regs
    ret
480
481
482
483
484
//------------------------------------------------------------------------------
// 16-column path, software pipelined over two rows.
//
// loads come in pairs: "ld1 {va},[p],#8" followed by "ld1 {vb},[p],x15" with
// x15 = -7, so va/vb hold 16 consecutive source bytes and p ends up one byte
// further. after eight pairs the even registers (v0,v2,v4,v6,v12,v14,v16,v18)
// are taps 0..7 for columns 0..7 and the odd registers (v1,v3,v5,v7,v13,v15,
// v17,v19) are taps 0..7 for columns 8..15.
//
// accumulators: v8 / v20 = row i, columns 0..7 / 8..15
//               v10 / v22 = row ii, columns 0..7 / 8..15
//
// inner_loop_16 finishes row i while loading row ii, then finishes row ii
// while loading the next row i block. the "eq" condition set by
// "subs x9,x9,#16" (row pair complete) stays live until "cmp x19,#0"; nothing
// in between may modify the flags.
//
// x0 (pu1_src) and x11 (dst base) are not written anywhere in this path, so
// the residual pass entered from epilog_16 can read them directly.
//------------------------------------------------------------------------------
outer_loop_16:
    mov         x15, #-7                    //second load of each pair rewinds to start + 1
    add         x10,x1,x3,lsl #1            //pi2_dst + dst_strd (second row, 2 bytes per element)
    add         x8,x16,x2                   //pu1_src + src_strd (second row)
    mov         x9,x14                      //columns left in this row pair
    add         x20,x16, x2, lsl #1
    prfm        PLDL1KEEP,[x20]             //prefetch two rows below row i
    ld1         {v0.2s},[x16],#8            //vector load pu1_src
    ld1         {v1.2s},[x16],x15           //vector load pu1_src
    add         x20,x8, x2, lsl #1
    prfm        PLDL1KEEP,[x20]             //prefetch two rows below row ii
    ld1         {v2.2s},[x16],#8
    ld1         {v3.2s},[x16],x15
    ld1         {v4.2s},[x16],#8
    ld1         {v5.2s},[x16],x15
    ld1         {v6.2s},[x16],#8
    ld1         {v7.2s},[x16],x15
    ld1         {v12.2s},[x16],#8
    ld1         {v13.2s},[x16],x15
    umull       v8.8h, v2.8b, v25.8b        //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    ld1         {v14.2s},[x16],#8
    ld1         {v15.2s},[x16],x15
    umlal       v8.8h, v6.8b, v27.8b        //mul_res = vmlal_u8(src[0_3], coeffabs_3)//
    ld1         {v16.2s},[x16],#8
    ld1         {v17.2s},[x16],x15
    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         {v18.2s},[x16],#8
    ld1         {v19.2s},[x16],x15
    umlsl       v8.8h, v4.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    umlal       v8.8h, v12.8b, v28.8b       //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    umlsl       v8.8h, v14.8b, v29.8b       //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    umlal       v8.8h, v16.8b, v30.8b       //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    umlsl       v8.8h, v18.8b, v31.8b       //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//


inner_loop_16:


    subs        x9,x9,#16                   //eq from here on means: this row pair is complete
    umull       v20.8h, v3.8b, v25.8b       //row i, columns 8..15: + c1*s1

    add         x16, x16,#8                 //x16 now points 16 columns further
    umlsl       v20.8h, v1.8b, v24.8b       //- c0*s0 (v1 must be consumed before it is reloaded)

    ld1         {v0.2s},[x8],#8             //vector load pu1_src + src_strd
    ld1         {v1.2s},[x8],x15            //vector load pu1_src + src_strd
    umlal       v20.8h, v7.8b, v27.8b       //+ c3*s3

    ld1         {v2.2s},[x8],#8
    ld1         {v3.2s},[x8],x15
    umlsl       v20.8h, v5.8b, v26.8b       //- c2*s2

    ld1         {v4.2s},[x8],#8
    ld1         {v5.2s},[x8],x15
    umlal       v20.8h, v13.8b, v28.8b      //+ c4*s4

    ld1         {v6.2s},[x8],#8
    ld1         {v7.2s},[x8],x15
    umlal       v20.8h, v17.8b, v30.8b      //+ c6*s6

    ld1         {v12.2s},[x8],#8
    ld1         {v13.2s},[x8],x15
    umlsl       v20.8h, v15.8b, v29.8b      //- c5*s5

    ld1         {v14.2s},[x8],#8
    ld1         {v15.2s},[x8],x15
    umlsl       v20.8h, v19.8b, v31.8b      //- c7*s7

    ld1         {v16.2s},[x8],#8
    ld1         {v17.2s},[x8],x15
    umull       v10.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_1], coeffabs_1)//

    ld1         {v18.2s},[x8],#8
    ld1         {v19.2s},[x8],x15
    umlal       v10.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_3], coeffabs_3)//

    add         x8, x8,#8                   //x8 now points 16 columns further
    umlsl       v10.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    add         x20,x16, x2, lsl #2
    prfm        PLDL1KEEP,[x20]             //prefetch four rows below the row i pointer
    add         x20,x8, x2, lsl #2
    prfm        PLDL1KEEP,[x20]             //prefetch four rows below the row ii pointer
    st1         {v8.16b},[x1],#16           //store row i, columns 0..7
    umlsl       v10.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    add         x20,x16,x13                 //increment the src pointer by 2*src_strd-wd
    csel        x16, x20, x16,eq            //only when the row pair is complete
    umlal       v10.8h, v12.8b, v28.8b      //mul_res = vmlal_u8(src[0_4], coeffabs_4)//

    add         x20,x16,x2                  //pu1_src + src_strd
    csel        x8, x20, x8,eq              //only when the row pair is complete
    umlsl       v10.8h, v14.8b, v29.8b      //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//

    umlal       v10.8h, v16.8b, v30.8b      //mul_res = vmlal_u8(src[0_6], coeffabs_6)//

    sub         x20,x19,#2
    csel        x19, x20, x19,eq            //two rows done, only when the row pair is complete
    umlsl       v10.8h, v18.8b, v31.8b      //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//

    //two further prfm instructions used to sit in this sequence; their address
    //computations had been commented out, so they prefetched from whatever
    //x20 held (x19 - 2 at this point). the prefetches above already cover
    //both source rows, so the stray ones are dropped.
    umull       v22.8h, v3.8b, v25.8b       //row ii, columns 8..15: + c1*s1

    umlsl       v22.8h, v1.8b, v24.8b       //- c0*s0

    st1         {v20.8h},[x1],#16           //store row i, columns 8..15
    umlal       v22.8h, v7.8b, v27.8b       //+ c3*s3

    umlsl       v22.8h, v5.8b, v26.8b       //- c2*s2

    umlal       v22.8h, v13.8b, v28.8b      //+ c4*s4

    cmp         x19,#0                      //all row pairs done?
    umlal       v22.8h, v17.8b, v30.8b      //+ c6*s6

    st1         {v10.8h},[x10],#16          //store row ii, columns 0..7
    umlsl       v22.8h, v15.8b, v29.8b      //- c5*s5

    umlsl       v22.8h, v19.8b, v31.8b      //- c7*s7

    beq         epilog_16                   //v22 is stored by the epilog

    ld1         {v0.2s},[x16],#8            //vector load pu1_src
    ld1         {v1.2s},[x16],x15           //vector load pu1_src
    ld1         {v2.2s},[x16],#8
    ld1         {v3.2s},[x16],x15
    ld1         {v4.2s},[x16],#8
    ld1         {v5.2s},[x16],x15
    ld1         {v6.2s},[x16],#8
    ld1         {v7.2s},[x16],x15
    ld1         {v12.2s},[x16],#8
    ld1         {v13.2s},[x16],x15
    umull       v8.8h, v2.8b, v25.8b        //mul_res = vmull_u8(src[0_1], coeffabs_1)//
    ld1         {v14.2s},[x16],#8
    ld1         {v15.2s},[x16],x15
    umlal       v8.8h, v6.8b, v27.8b        //mul_res = vmlal_u8(src[0_3], coeffabs_3)//
    ld1         {v16.2s},[x16],#8
    ld1         {v17.2s},[x16],x15
    umlsl       v8.8h, v0.8b, v24.8b        //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    ld1         {v18.2s},[x16],#8
    ld1         {v19.2s},[x16],x15
    umlsl       v8.8h, v4.8b, v26.8b        //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    umlal       v8.8h, v12.8b, v28.8b       //mul_res = vmlal_u8(src[0_4], coeffabs_4)//
    cmp         x9,#0                       //row pair complete? (flags were overwritten by cmp x19)
    umlsl       v8.8h, v14.8b, v29.8b       //mul_res = vmlsl_u8(src[0_5], coeffabs_5)//
    csel        x9, x14, x9,eq              //restart the column counter for the next row pair
    umlal       v8.8h, v16.8b, v30.8b       //mul_res = vmlal_u8(src[0_6], coeffabs_6)//
    st1         {v22.16b},[x10],#16         //store row ii, columns 8..15
    umlsl       v8.8h, v18.8b, v31.8b       //mul_res = vmlsl_u8(src[0_7], coeffabs_7)//
    add         x20,x10,x12,lsl #1          //row after row ii: x10 + 2*(dst_strd-wd) bytes
    csel        x1, x20, x1,eq
    add         x20,x1,x3,lsl #1            //pi2_dst + dst_strd
    csel        x10, x20, x10,eq
    b           inner_loop_16


epilog_16:
    st1         {v22.16b},[x10],#16         //store the last row ii, columns 8..15

    sxtw        x14,w6                      //reload the original wd (word32 argument)
    cmp         x14,#24
    beq         outer_loop8_residual        //wd 24: 8 columns of every row are still pending
    add         x1,x10,x12,lsl #1           //dst pointer of the row after the last pair
    and         x11,x5,#1                   //ht_residue = (ht & 1)
    cmp         x11,#1
    beq         height_residue_4            //odd ht: one row is still pending

end_loops1:

    ldp         x19, x20,[sp], #16
    pop_v_regs
    ret
670
671
672
673
674
675
676
677
678
679