///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_inter_pred_chroma_horz_neon.s
//*
//* @brief
//*  contains function definitions for inter prediction interpolation.
//* functions are coded using neon intrinsics and can be compiled using
//* rvct
//*
//* @author
//*  yogeswaran rs / akshaya mukund
//*
//* @par list of functions:
//*
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*       chroma inter prediction filter to store horizontal 16bit output
//*
//* @par description:
//*    applies a horizontal filter with coefficients pointed to by 'pi1_coeff'
//*    to the elements pointed by 'pu1_src' and writes to the location pointed
//*    by 'pu1_dst'. no downshifting or clipping is done and the output is used
//*    as an input for vertical filtering or weighted prediction
//*    (a scalar reference sketch follows the c prototype below)
//*
//* @param[in] pu1_src
//*  uword8 pointer to the source
//*
//* @param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_horz_w16out(uword8 *pu1_src,
//                                          word16 *pi2_dst,
//                                          word32 src_strd,
//                                          word32 dst_strd,
//                                          word8 *pi1_coeff,
//                                          word32 ht,
//                                          word32 wd)
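///**
//* a minimal scalar sketch of what this routine computes, kept here as a
//* comment for reference only (it is not assembled). it assumes the 4-tap hevc
//* chroma filter applied to interleaved cb/cr samples (neighbouring taps of one
//* plane are 2 bytes apart), pu1_src pointing one chroma pair past the first
//* tap, and wd counted in chroma pixel pairs so each row produces 2*wd outputs.
//* the helper name chroma_horz_w16out_ref is illustrative, not a library symbol.
//*
//* static void chroma_horz_w16out_ref(uword8 *pu1_src, word16 *pi2_dst,
//*                                    word32 src_strd, word32 dst_strd,
//*                                    word8 *pi1_coeff, word32 ht, word32 wd)
//* {
//*     for(word32 row = 0; row < ht; row++)
//*     {
//*         for(word32 col = 0; col < 2 * wd; col++)
//*         {
//*             word32 sum = 0;
//*             for(word32 i = 0; i < 4; i++)
//*                 sum += pi1_coeff[i] * pu1_src[col + (i - 1) * 2];
//*             pi2_dst[col] = (word16)sum;     // no downshift, no clipping
//*         }
//*         pu1_src += src_strd;
//*         pi2_dst += dst_strd;
//*     }
//* }
//*/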
//**************variables vs registers*****************************************
//x0 => *pu1_src
//x1 => *pi2_dst
//x2 =>  src_strd
//x3 =>  dst_strd
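//x4 => *pi1_coeff (copied to x15 at entry)
//x5 =>  ht        (copied to x16 at entry)
//x6 =>  wd        (copied to x17 at entry)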
.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_horz_w16out_av8


.type ihevc_inter_pred_chroma_horz_w16out_av8, %function

ihevc_inter_pred_chroma_horz_w16out_av8:

    // stmfd sp!, {x4-x12, x14}                    //stack stores the values of the arguments

    stp         d10,d11,[sp,#-16]!          //save callee-saved simd registers (aapcs64)
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!         //save callee-saved gp registers
    mov         x15,x4 // pi1_coeff
    mov         x16,x5 // ht
    mov         x17,x6 // wd

    mov         x4,x15                      //loads pi1_coeff
    mov         x6,x16                      //loads ht
    mov         x10,x17                     //loads wd

    ld1         {v0.8b},[x4]                //coeff = vld1_s8(pi1_coeff)
    subs        x14,x6,#0                   //checks for ht == 0
    abs         v2.8b, v0.8b                //vabs_s8(coeff)

//******* added
    mov         x11, #2
//******* added ends

    ble         end_loops

    dup         v24.8b, v2.b[0]             //coeffabs_0 = vdup_lane_u8(coeffabs, 0)
    sub         x12,x0,#2                   //pu1_src - 2
    dup         v25.8b, v2.b[1]             //coeffabs_1 = vdup_lane_u8(coeffabs, 1)
    add         x4,x12,x2                   //pu1_src_tmp2_8 = pu1_src + src_strd
    dup         v26.8b, v2.b[2]             //coeffabs_2 = vdup_lane_u8(coeffabs, 2)

    tst         x10,#3                      //checks wd for multiples of 4
    lsl         x5, x10, #1                 //2wd

    dup         v27.8b, v2.b[3]             //coeffabs_3 = vdup_lane_u8(coeffabs, 3)

    and         x7,x14,#1                   //added                //ht_residue = (ht & 1)
    sub         x14,x14,x7                  //added                //decrement height by ht_residue (residue rows are handled separately)

    bne         outer_loop_4                // this branching happens when the width is 2 or 6

    cmp         x10,#12
    beq         skip_16

    cmp         x10,#8
    bge         outer_loop_16

skip_16:
    tst         x6,#3

//******* removal
    //mov        x11,#8
//******* removal ends

    sub         x9,x0,#2
    beq         outer_loop_ht_4             //this branching happens when the height is a multiple of 4



//     cmp        x10,#12
//     beq     outer_loop_8
//     cmp        x10,#16
//     bge    outer_loop_16
    b           outer_loop_8
outer_loop_16:
    add         x4,x12,x2


    and         x0, x12, #31
    add         x20,x12, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]






    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //vector load pu1_src
    ld1         { v1.2s},[x19],x11          //vector load pu1_src
    mov         x10,x5                      //2wd
    mul         x14, x14 , x10
    ld1         { v2.2s},[x12],x11          //vector load pu1_src
    ld1         { v3.2s},[x19],x11          //vector load pu1_src
    add         x20,x4, x2 , lsl #1
    prfm        PLDL1KEEP,[x20]
    mov         x9,#10
    ld1         { v4.2s},[x12],x11          //vector load pu1_src
    ld1         { v5.2s},[x19],x11          //vector load pu1_src
    sub         x20,x3,#8
    neg         x6, x20
    sub         x8,x3,#8
    ld1         { v6.2s},[x12],x9           //vector load pu1_src
    ld1         { v7.2s},[x19],x9           //vector load pu1_src


    add         x19,x4,#8
    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         { v29.2s},[x4],x11          //vector load pu1_src
    ld1         { v31.2s},[x19],x11         //vector load pu1_src

    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         { v10.2s},[x4],x11          //vector load pu1_src
    ld1         { v11.2s},[x19],x11         //vector load pu1_src

    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         { v12.2s},[x4],x11          //vector load pu1_src
    ld1         { v13.2s},[x19],x11         //vector load pu1_src

    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    ld1         { v14.4s},[x4],x9           //vector load pu1_src
    ld1         { v15.2s},[x19],x9          //vector load pu1_src

    umull       v28.8h, v3.8b, v25.8b
    lsl         x6,x6,#1
    sub         x20,x5,x3,lsl #1
    neg         x3, x20
    umlsl       v28.8h, v1.8b, v24.8b
    lsl         x8,x8,#1
    sub         x20,x5,x2,lsl #1
    neg         x7, x20
    umlal       v28.8h, v5.8b, v26.8b

    umlsl       v28.8h, v7.8b, v27.8b
    cmp         x14,#32
    beq         epilog_end
    sub         x14, x14,#64
inner_loop_16:

    // and            x7, x12, #31                    //decrement the wd loop
    // cmp            x7, x0
    add         x20,x12, x2 , lsl #2
    prfm        PLDL1KEEP,[x20]
    add         x20,x4, x2 , lsl #2
    prfm        PLDL1KEEP,[x20]


    subs        x10,x10,#16

    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//



//     add x20,x12,x2,lsl #1
    //csel x12, x20, x12,eq
//     sub x20,x12,x5
    //csel x12, x20, x12,eq
    add         x20,x12,x7
    csel        x12, x20, x12,eq
    add         x20,x12,x2
    csel        x4, x20, x4,eq


    st1         { v30.8h}, [x1],#16
    umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//




    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //vector load pu1_src
    ld1         { v1.2s},[x19],x11          //vector load pu1_src
    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//




    ld1         { v2.2s},[x12],x11          //vector load pu1_src
    ld1         { v3.2s},[x19],x11          //vector load pu1_src
    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//


    ld1         { v4.2s},[x12],x11          //vector load pu1_src
    ld1         { v5.2s},[x19],x11          //vector load pu1_src
    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//

    st1         { v28.8h}, [x1],x8
    umlsl       v20.8h, v31.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         { v6.2s},[x12],x9           //vector load pu1_src
    ld1         { v7.2s},[x19],x9           //vector load pu1_src
    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    add         x19,x4,#8
    ld1         { v29.2s},[x4],x11          //vector load pu1_src
    ld1         { v31.2s},[x19],x11         //vector load pu1_src
    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//


    ld1         { v10.2s},[x4],x11          //vector load pu1_src
    ld1         { v11.2s},[x19],x11         //vector load pu1_src
    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//

    ld1         { v12.2s},[x4],x11          //vector load pu1_src
    ld1         { v13.2s},[x19],x11         //vector load pu1_src
    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         { v14.2s},[x4],x9           //vector load pu1_src
    ld1         { v15.2s},[x19],x9          //vector load pu1_src
    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    st1         { v22.8h},[x1],#16          //store the result pu1_dst
    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    csel        x10, x5, x10,eq             //2wd
    umull       v28.8h, v3.8b, v25.8b



    umlsl       v28.8h, v1.8b, v24.8b
    st1         { v20.8h},[x1],x6           //store the result pu1_dst


    add         x20,x1,x3,lsl #1
    csel        x1, x20, x1,eq
    umlal       v28.8h, v5.8b, v26.8b

    subs        x14,x14,#32                 //decrement the ht loop
    umlsl       v28.8h, v7.8b, v27.8b



//     mov            x0, x7
    bgt         inner_loop_16



    add         x14,x14,#64
    cmp         x14,#32
    beq         epilog_end
epilog:

    st1         { v30.8h}, [x1],#16
    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    st1         { v28.8h}, [x1],x8



    umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    subs        x10,x10,#16                 //decrement the wd loop
    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
//     add x20,x12,x2,lsl #1
    //csel x12, x20, x12,eq
    add         x20,x12,x7
    csel        x12, x20, x12,eq
    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    // sub x20,x12,x5
    //csel x12, x20, x12,eq
    csel        x10, x5, x10,eq             //2wd
    add         x20,x12,x2
    csel        x4, x20, x4,eq
    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//

    add         x19,x12,#8
    ld1         { v0.2s},[x12],x11          //vector load pu1_src
    ld1         { v1.2s},[x19],x11          //vector load pu1_src

    umlsl       v20.8h, v31.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         { v2.2s},[x12],x11          //vector load pu1_src
    ld1         { v3.2s},[x19],x11          //vector load pu1_src

    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         { v4.2s},[x12],x11          //vector load pu1_src
    ld1         { v5.2s},[x19],x11          //vector load pu1_src

    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    ld1         { v6.2s},[x12],x9           //vector load pu1_src
    ld1         { v7.2s},[x19],x9           //vector load pu1_src
    umull       v30.8h, v2.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//

    add         x19,x4,#8
    ld1         { v29.2s},[x4],x11          //vector load pu1_src
    ld1         { v31.2s},[x19],x11         //vector load pu1_src
    umlsl       v30.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         { v10.2s},[x4],x11          //vector load pu1_src
    ld1         { v11.2s},[x19],x11         //vector load pu1_src
    umlal       v30.8h, v4.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    umlsl       v30.8h, v6.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    ld1         { v12.2s},[x4],x11          //vector load pu1_src
    ld1         { v13.2s},[x19],x11         //vector load pu1_src
    umull       v28.8h, v3.8b, v25.8b

    ld1         { v14.2s},[x4],x9           //vector load pu1_src
    ld1         { v15.2s},[x19],x9          //vector load pu1_src

    umlsl       v28.8h, v1.8b, v24.8b
    st1         { v22.8h},[x1],#16          //store the result pu1_dst
    umlal       v28.8h, v5.8b, v26.8b
    st1         { v20.8h},[x1],x6           //store the result pu1_dst
    umlsl       v28.8h, v7.8b, v27.8b
    add         x20,x1,x3,lsl #1
    csel        x1, x20, x1,eq


epilog_end:

    umull       v22.8h, v10.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    umlsl       v22.8h, v29.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    umlal       v22.8h, v12.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlsl       v22.8h, v14.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//


    umull       v20.8h, v11.8b, v25.8b      //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    umlsl       v20.8h, v31.8b, v24.8b      //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    umlal       v20.8h, v13.8b, v26.8b      //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlsl       v20.8h, v15.8b, v27.8b      //mul_res = vmlal_u8(src[0_1], coeffabs_1)//


    st1         { v30.8h}, [x1],#16
    st1         { v28.8h}, [x1],x8
    st1         { v22.8h},[x1],#16          //store the result pu1_dst
    st1         { v20.8h},[x1],x6           //store the result pu1_dst


    mov         x6,x16                      //loads ht

    and         x7,x6,#1

    cmp         x7,#0
    mov         x10,x5
    add         x20,x12,x2,lsl #1
    csel        x12, x20, x12,ne
    sub         x20,x12,x5
    csel        x12, x20, x12,ne
    add         x20,x1,x3,lsl #1
    csel        x1, x20, x1,ne


    bgt         loop_residue_4

    b           end_loops
outer_loop_8:

    add         x6,x1,x3,lsl #1             //pu1_dst + dst_strd
    mov         x10,x5                      //2wd
    add         x4,x12,x2                   //pu1_src + src_strd

inner_loop_8:
    //ld1 {v0.2s, v1.2s},[x12],x11                //vector load pu1_src
    ld1         {v0.2s},[x12],x11           //vector load pu1_src
    ld1         {v1.2s},[x12],x11           //vector load pu1_src
    ld1         {v2.2s},[x12],x11           //vector load pu1_src
    ld1         {v3.2s},[x12],x11           //vector load pu1_src


    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
    umull       v29.8h, v1.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    umlsl       v29.8h, v0.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]
    umlal       v29.8h, v2.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlsl       v29.8h, v3.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    //ld1 {v12.2s, v13.2s},[x4],x11                //vector load pu1_src + src_strd
    ld1         {v4.2s},[x4],x11            //vector load pu1_src
    ld1         {v5.2s},[x4],x11            //vector load pu1_src
    ld1         {v6.2s},[x4],x11            //vector load pu1_src
    ld1         {v7.2s},[x4],x11            //vector load pu1_src
    //vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
    umull       v10.8h, v5.8b, v25.8b       //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    umlsl       v10.8h, v4.8b, v24.8b       //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    //vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]
    umlal       v10.8h, v6.8b, v26.8b       //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    umlsl       v10.8h, v7.8b, v27.8b       //mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    st1         {v29.8h}, [x1],#16

    subs        x10,x10,#8                  //decrement the wd loop
    st1         {v10.8h},[x6],#16           //store the result pu1_dst
    bgt         inner_loop_8

    sub         x12,x12,x5
    subs        x14,x14,#2                  //decrement the ht loop
    sub         x1,x1,x5,lsl #1
    add         x12,x12,x2,lsl #1
    add         x1,x1,x3,lsl #2
    bgt         outer_loop_8

    cmp         x7,#0
    mov         x10,x5
    bgt         loop_residue_4

    b           end_loops
//case where the height is a multiple of 4
outer_loop_ht_4:

    mov         x10,x5

prologue_ht_4:
    lsl         x8, x3, #1

inner_loop_ht_4:

    mov         x12,x9
    mov         x4,x1

    sub         x0, x2, #6                  // not sure if x0 needs to be preserved

    ld1         {v0.2s},[x12],x11           //(1)vector load pu1_src
    ld1         {v1.2s},[x12],x11           //(1)vector load pu1_src
    ld1         {v2.2s},[x12],x11           //(1)vector load pu1_src
    ld1         {v3.2s},[x12],x0            //(1)vector load pu1_src

    ld1         {v4.2s},[x12],x11           //(2)vector load pu1_src
    ld1         {v5.2s},[x12],x11           //(2)vector load pu1_src
    ld1         {v6.2s},[x12],x11           //(2)vector load pu1_src
    ld1         {v7.2s},[x12],x0            //(2)vector load pu1_src

    ld1         {v14.2s},[x12],x11          //(3)vector load pu1_src
    umull       v29.8h, v1.8b, v25.8b       //(1)mul_res = vmull_u8(src[0_3], coeffabs_3)//

    ld1         {v15.2s},[x12],x11          //(3)vector load pu1_src
    umlsl       v29.8h, v0.8b, v24.8b       //(1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         {v16.2s},[x12],x11          //(3)vector load pu1_src
    umlal       v29.8h, v2.8b, v26.8b       //(1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v17.2s},[x12],x0           //(3)vector load pu1_src
    umlsl       v29.8h, v3.8b, v27.8b       //(1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    ld1         {v18.2s},[x12],x11          //(4)vector load pu1_src
    umull       v10.8h, v5.8b, v25.8b       //(2)mul_res = vmull_u8(src[0_3], coeffabs_3)//

    ld1         {v19.2s},[x12],x11          //(4)vector load pu1_src
    umlsl       v10.8h, v4.8b, v24.8b       //(2)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         {v20.2s},[x12],x11          //(4)vector load pu1_src
    umlal       v10.8h, v6.8b, v26.8b       //(2)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v21.2s},[x12],x2           //(4)vector load pu1_src
    umlsl       v10.8h, v7.8b, v27.8b       //(2)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    add         x9,x9,#8                    //(core loop)

    subs        x10,x10,#8                  //(prologue)decrement the wd loop
    beq         epilogue

core_loop:
    st1         {v29.8h},[x4],x8            //(1)store the result pu1_dst
    mov         x12,x9

    ld1         {v0.2s},[x12],x11           //(1_1)vector load pu1_src
    umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//

    ld1         {v1.2s},[x12],x11           //(1_1)vector load pu1_src
    umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         {v2.2s},[x12],x11           //(1_1)vector load pu1_src
    umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v3.2s},[x12],x0            //(1_1)vector load pu1_src
    umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    st1         {v10.8h},[x4],x8            //(2)store the result pu1_dst
    add         x9,x9,#8                    //(core loop)

    ld1         {v4.2s},[x12],x11           //(2_1)vector load pu1_src
    umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//

    ld1         {v5.2s},[x12],x11           //(2_1)vector load pu1_src
    umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         {v6.2s},[x12],x11           //(2_1)vector load pu1_src
    umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v7.2s},[x12],x0            //(2_1)vector load pu1_src
    umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    st1         {v12.8h},[x4],x8            //(3)store the result pu1_dst
    add         x1,x1,#16                   //(core loop)

    ld1         {v14.2s},[x12],x11          //(3_1)vector load pu1_src
    umull       v29.8h, v1.8b, v25.8b       //(1_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//

    ld1         {v15.2s},[x12],x11          //(3_1)vector load pu1_src
    umlsl       v29.8h, v0.8b, v24.8b       //(1_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         {v16.2s},[x12],x11          //(3_1)vector load pu1_src
    umlal       v29.8h, v2.8b, v26.8b       //(1_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    ld1         {v17.2s},[x12],x0           //(3_1)vector load pu1_src
    umlsl       v29.8h, v3.8b, v27.8b       //(1_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    st1         {v22.8h}, [x4], x8          //(4)store the result pu1_dst
    subs        x10,x10,#8                  //(core loop)

    umull       v10.8h, v5.8b, v25.8b       //(2_1)mul_res = vmull_u8(src[0_3], coeffabs_3)//
    ld1         {v18.2s},[x12],x11          //(4_1)vector load pu1_src

    ld1         {v19.2s},[x12],x11          //(4_1)vector load pu1_src
    umlsl       v10.8h, v4.8b, v24.8b       //(2_1)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    ld1         {v20.2s},[x12],x11          //(4_1)vector load pu1_src
    umlal       v10.8h, v6.8b, v26.8b       //(2_1)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    mov         x4, x1                      //(core loop)

    ld1         {v21.2s},[x12],x0           //(4_1)vector load pu1_src
    umlsl       v10.8h, v7.8b, v27.8b       //(2_1)mul_res = vmlal_u8(src[0_1], coeffabs_1)//



    bgt         core_loop                   //loopback

epilogue:
    umull       v12.8h, v15.8b, v25.8b      //(3)mul_res = vmull_u8(src[0_3], coeffabs_3)//

    umlsl       v12.8h, v14.8b, v24.8b      //(3)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    umlal       v12.8h, v16.8b, v26.8b      //(3)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    umlsl       v12.8h, v17.8b, v27.8b      //(3)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    st1         {v29.8h},[x4], x8           //(1)store the result pu1_dst

    umull       v22.8h, v19.8b, v25.8b      //(4)mul_res = vmull_u8(src[0_3], coeffabs_3)//
    umlsl       v22.8h, v18.8b, v24.8b      //(4)mul_res = vmlsl_u8(src[0_2], coeffabs_2)//

    umlal       v22.8h, v20.8b, v26.8b      //(4)mul_res = vmlsl_u8(src[0_0], coeffabs_0)//

    umlsl       v22.8h, v21.8b, v27.8b      //(4)mul_res = vmlal_u8(src[0_1], coeffabs_1)//

    st1         {v10.8h},[x4], x8           //(2)store the result pu1_dst

    st1         {v12.8h},[x4], x8           //(3)store the result pu1_dst

    add         x1,x1,#16                   //(core loop)

    st1         {v22.8h},[x4], x8           //(4)store the result pu1_dst

    sub         x9,x9,x5
    subs        x14,x14,#4                  //decrement the ht loop
    sub         x1,x1,x5,lsl #1
    add         x9,x9,x2,lsl #2
    add         x1,x1,x3,lsl #3
    bgt         outer_loop_ht_4

    cmp         x7,#0
    mov         x10,x5
    csel        x12, x9, x12,gt
    csel        x4, x1, x4,gt
    bgt         loop_residue_4

    b           end_loops
outer_loop_4:
    add         x6,x1,x3,lsl #1             //pu1_dst + dst_strd
    mov         x10,x5
    add         x4,x12,x2                   //pu1_src + src_strd

inner_loop_4:
    //ld1 {v0.2s, v1.2s},[x12]                    //vector load pu1_src
    ld1         {v20.2s},[x12],x11          //vector load pu1_src
    ld1         {v21.2s},[x12],x11          //vector load pu1_src
    ld1         {v22.2s},[x12],x11          //vector load pu1_src
    ld1         {v23.2s},[x12]              //vector load pu1_src

//**** removal
    //add        x12,x12,#4                        //increment the input pointer
//**** removal ends
//**** addn
    sub         x12,x12,#2                  //step the input pointer back by 2
//**** addn ends
    ld1         {v16.2s},[x4],x11           //vector load pu1_src
    ld1         {v17.2s},[x4],x11           //vector load pu1_src
    ld1         {v18.2s},[x4],x11           //vector load pu1_src
    ld1         {v19.2s},[x4]               //vector load pu1_src
    //vext.u8    d2,d0,d1,#2                        //vector extract of src[0_2]
    //vext.u8    d4,d0,d1,#4                        //vector extract of src[0_4]
    //ld1 {v12.2s, v13.2s},[x4]                    //vector load pu1_src + src_strd
    //vext.u8    d6,d0,d1,#6                        //vector extract of src[0_6]

    //add        x4,x4,#4                        //increment the input pointer
    sub         x4,x4,#2
    //vext.u8    d14,d12,d13,#2                    //vector extract of src[0_2]
    //vext.u8    d16,d12,d13,#4                    //vector extract of src[0_4]
    //vext.u8    d18,d12,d13,#6                    //vector extract of src[0_6]

//**** removal
    //zip1 v0.2s, v0.2s, v12.2s
    //zip2  v12.2s, v0.2s, v12.2s                             //vector zip the i iteration and ii iteration in a single register
    //zip1 v2.2s, v2.2s, v14.2s
    //zip2  v14.2s, v2.2s, v14.2s
    //zip1 v4.2s, v4.2s, v16.2s
    //zip2  v16.2s, v4.2s, v16.2s
    //zip1 v6.2s, v6.2s, v18.2s
    //zip2  v18.2s, v6.2s, v18.2s
//**** removal ends
//**** addn
    zip1        v0.2s, v20.2s, v16.2s
    zip2        v4.2s, v20.2s, v16.2s       //vector zip the i iteration and ii iteration in a single register
    zip1        v1.2s, v21.2s, v17.2s
    zip2        v5.2s, v21.2s, v17.2s
    zip1        v2.2s, v22.2s, v18.2s
    zip2        v6.2s, v22.2s, v18.2s
    zip1        v3.2s, v23.2s, v19.2s
    zip2        v7.2s, v23.2s, v19.2s
//**** addn ends

    umull       v29.8h, v1.8b, v25.8b       //arithmetic operations for the ii iteration at the same time
    umlsl       v29.8h, v0.8b, v24.8b
    umlal       v29.8h, v2.8b, v26.8b
    umlsl       v29.8h, v3.8b, v27.8b

    st1         {v29.d}[0],[x1],#8          //store the i iteration result which is in the lower part of the register
    subs        x10,x10,#4                  //decrement the wd by 4

    st1         {v29.d}[1],[x6],#8          //store the ii iteration result which is in the upper part of the register

    bgt         inner_loop_4

    sub         x12,x12,x5
    subs        x14,x14,#2                  //decrement the ht by 2
    sub         x1,x1,x5,lsl #1
    add         x12,x12,x2,lsl #1
    add         x1,x1,x3,lsl #2
    bgt         outer_loop_4

    cmp         x7,#0
    mov         x10,x5
    beq         end_loops
loop_residue_4:

    mov         x10,x5                      //2wd

loop_residue:

    //ld1 {v0.2s, v1.2s},[x12]                    //vector load pu1_src
    ld1         {v20.2s},[x12],x11          //vector load pu1_src
    ld1         {v21.2s},[x12],x11          //vector load pu1_src
    ld1         {v22.2s},[x12],x11          //vector load pu1_src
    ld1         {v23.2s},[x12]              //vector load pu1_src
    //vext.u8        d2,d0,d1,#2                //vector extract of src[0_2]
    //umull v8.8h, v2.8b, v25.8b                //mul_res = vmull_u8(src[0_3], coeffabs_3)//
    //umlsl v8.8h, v0.8b, v24.8b                //mul_res = vmlsl_u8(src[0_2], coeffabs_2)//
    //vext.u8        d4,d0,d1,#4                //vector extract of src[0_4]
    //add            x12,x12,#4                //pu1_src + 4
    sub         x12, x12, #2
    //vext.u8        d6,d0,d1,#6                //vector extract of src[0_6]
    //umlal v8.8h, v4.8b, v26.8b                //mul_res = vmlsl_u8(src[0_0], coeffabs_0)//
    //umlsl v8.8h, v6.8b, v27.8b                //mul_res = vmlal_u8(src[0_1], coeffabs_1)//
    umull       v29.8h, v21.8b, v25.8b
    umlsl       v29.8h, v20.8b, v24.8b
    umlal       v29.8h, v22.8b, v26.8b
    umlsl       v29.8h, v23.8b, v27.8b

    st1         {v29.1d},[x1]               //store the result pu1_dst
    subs        x10,x10,#4                  //decrement the wd loop
    add         x1,x1,#8                    //pi2_dst + 8

    bgt         loop_residue                //loop again

    //inner loop ends
    //add            x8,x3,lsl #1            //2*dst_strd
    //sub             x8,x8,x5,lsl #1            //2*dst_strd - 2wd
    //sub             x9,x2,x5                //src_strd - 2wd
    //subs             x7,x7,#1                //decrement the ht loop
    //add             x12,x12,x9                //pu1_src + src_strd
    //add            x1,x1,x8                //pu1_dst + 2*dst_strd
    //bgt              outer_loop_residue_4    //loop again
    //b                 end_loops                //jumps to end

end_loops:

    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ret