1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_weighted_pred_bi_default.s
22//*
23//* @brief
24//*  contains function definitions for weighted prediction used in inter
25//* prediction
26//*
27//* @author
28//*  parthiban v
29//*
30//* @par list of functions:
31//*  - ihevc_weighted_pred_bi_default()
32//*
33//* @remarks
34//*  none
35//*
36//*******************************************************************************
37//*/
38///**
39//*******************************************************************************
40//*
41//* @brief
//*  does default bi-weighted prediction on the arrays pointed to by pi2_src1
//* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
//* assumption: the function is optimized considering the fact that width and
//* height are multiples of 2.
46//*
47//* @par description:
//*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
//* >> shift,  where shift = 15 - bitdepth
50//*
51//* @param[in] pi2_src1
52//*  pointer to source 1
53//*
54//* @param[in] pi2_src2
55//*  pointer to source 2
56//*
57//* @param[out] pu1_dst
58//*  pointer to destination
59//*
60//* @param[in] src_strd1
61//*  source stride 1
62//*
63//* @param[in] src_strd2
64//*  source stride 2
65//*
66//* @param[in] dst_strd
67//*  destination stride
68//*
69//* @param[in] lvl_shift1
70//*  added before shift and offset
71//*
72//* @param[in] lvl_shift2
73//*  added before shift and offset
74//*
75//* @param[in] ht
76//*  height of the source
77//*
78//* @param[in] wd
79//*  width of the source
80//*
81//* @returns
82//*
83//* @remarks
84//*  none
85//*
86//*******************************************************************************
87//*/
88//void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
89//                                    word16 *pi2_src2,
90//                                    uword8 *pu1_dst,
91//                                    word32 src_strd1,
92//                                    word32 src_strd2,
93//                                    word32 dst_strd,
94//                                    word32 lvl_shift1,
95//                                    word32 lvl_shift2,
96//                                    word32 ht,
97//                                    word32 wd)
98
99//**************variables vs registers*****************************************
100//    x0 => *pi2_src1
101//    x1 => *pi2_src2
102//    x2 => *pu1_dst
103//    x3 =>  src_strd1
104//    x4 =>  src_strd2
105//    x5 =>  dst_strd
106//    x6 =>  lvl_shift1
107//    x7 =>  lvl_shift2
//    x8 =>  ht          (stack argument, loaded from [sp,#0])
//    x9 =>  wd          (stack argument, loaded from [sp,#8])
110.text
111.align 4
112
113.include "ihevc_neon_macros.s"
114
.globl ihevc_weighted_pred_bi_default_av8

.type ihevc_weighted_pred_bi_default_av8, %function

//-----------------------------------------------------------------------------
// void ihevc_weighted_pred_bi_default_av8(word16 *pi2_src1, word16 *pi2_src2,
//                                         uword8 *pu1_dst,
//                                         word32 src_strd1, word32 src_strd2,
//                                         word32 dst_strd,
//                                         word32 lvl_shift1, word32 lvl_shift2,
//                                         word32 ht, word32 wd)
//
// Per sample:  dst = sat_u8((src1 + src2 + c) >> 7)
//   c = lvl_shift1 + lvl_shift2 + 0x40 is built once with wrapping 16-bit adds;
//   src1 + src2 + c uses saturating signed 16-bit adds (sqadd) and the result
//   is narrowed with a saturating unsigned shift right by 7 (sqshrun).
//
// In   : x0 = pi2_src1, x1 = pi2_src2, x2 = pu1_dst,
//        w3 = src_strd1, w4 = src_strd2 (in 16-bit elements), w5 = dst_strd,
//        w6 = lvl_shift1, w7 = lvl_shift2,
//        [sp,#0] = ht, [sp,#8] = wd
// Uses : x0-x17, v0-v7, v16-v31, flags. No stack frame, no callee-saved
//        registers (leaf routine, all branches are local).
//
// Path selection (in this order):
//   (ht | wd) == 10 -> chroma 8x2 : 8 columns x 2 rows per step
//   (ht | wd) == 6  -> chroma 4x2 : 4 columns x 2 rows per step
//   wd == 24        -> 8 columns  x 4 rows per step
//   wd >= 16        -> 16 columns x 2 rows per step (software pipelined)
//   wd == 12        -> 4 columns  x 4 rows per step
//   wd >= 8         -> 8 columns  x 4 rows per step
//   otherwise       -> 4 columns  x 4 rows per step
//-----------------------------------------------------------------------------
ihevc_weighted_pred_bi_default_av8:

    ldr         w8,[sp,#0]                  //x8 = ht (zero-extended)
    ldr         w9,[sp,#8]                  //x9 = wd (zero-extended)

    // The strides are 32-bit arguments: the upper halves of x3-x5 are not
    // defined by the procedure call standard, so widen them before any
    // 64-bit address arithmetic.
    sxtw        x3,w3                       //src_strd1
    sxtw        x4,w4                       //src_strd2
    sxtw        x5,w5                       //dst_strd

    lsl         x3,x3,#1                    //x3 = src_strd1 in bytes (16-bit samples)
    lsl         x4,x4,#1                    //x4 = src_strd2 in bytes
    dup         v4.8h,w6                    //lvl_shift1 in every 16-bit lane
    dup         v6.8h,w7                    //lvl_shift2 in every 16-bit lane
    movi        v0.8h, #0x40                //rounding term 1 << (shift - 1), shift = 7
    add         v4.8h,  v4.8h,v6.8h
    add         v0.8h,  v0.8h ,  v4.8h      //v0 = lvl_shift1 + lvl_shift2 + 0x40
    lsl         x6,x9,#1                    //x6 = 2*wd = bytes consumed per source row
    sub         x13,x6,x3,lsl #2
    neg         x7, x13                     //x7  = 4 src1 rows (bytes) - 2*wd
    sub         x13,x6,x4,lsl #2
    neg         x10, x13                    //x10 = 4 src2 rows (bytes) - 2*wd

    cbz         x8,end_loops                //ht == 0 : nothing to do

chroma_decision:
    orr         x14,x8,x9
    cmp         x14,#10
    beq         outer_loop_chroma_8x2

    cmp         x14,#6
    beq         outer_loop_chroma_4x2


luma_decision:
    cmp         x9,#24
    beq         outer_loop_8

    cmp         x9,#16
    bge         outer_loop_16

    cmp         x9,#12
    beq         outer_loop_4

    cmp         x9,#8
    bge         outer_loop_8

    // any other width falls through to the 4-wide path


// 4 columns x 4 rows per inner iteration
outer_loop_4:
    cbz         x9,end_loops                //wd == 0 : nothing to do

core_loop_4:
    add         x11,x0,x3                   //x11 = src1, next row
    add         x12,x1,x4                   //x12 = src2, next row
    ld1         {v6.4h},[x0],#8             //src1 row 0, advance 4 samples
    add         x14,x2,x5                   //x14 = dst, next row
    ld1         {v7.4h},[x1],#8             //src2 row 0, advance 4 samples
    ld1         {v1.4h},[x11],x3            //src1 row 1
    sqadd       v18.4h,v6.4h,v7.4h
    sqadd       v18.4h,v18.4h,v0.4h         //row 0 sum + constant term
    ld1         {v3.4h},[x12],x4            //src2 row 1
    sqadd       v20.4h,v1.4h,v3.4h
    sqadd       v19.4h,v20.4h,v0.4h         //row 1 sum + constant term
    mov         v18.d[1],v19.d[0]           //pack rows 0 and 1 into one q register
    sqshrun     v20.8b, v18.8h,#7
    ld1         {v22.4h},[x11],x3           //src1 row 2
    ld1         {v23.4h},[x12],x4           //src2 row 2
    sqadd       v30.4h,v22.4h,v23.4h
    sqadd       v30.4h,v30.4h,v0.4h         //row 2 sum + constant term
    ld1         {v24.4h},[x11],x3           //src1 row 3
    ld1         {v25.4h},[x12],x4           //src2 row 3
    sqadd       v18.4h,v24.4h,v25.4h
    sqadd       v31.4h,v18.4h,v0.4h         //row 3 sum + constant term
    mov         v30.d[1],v31.d[0]           //pack rows 2 and 3
    st1         {v20.s}[0],[x2],#4          //store dst row 0, advance 4 pixels
    st1         {v20.s}[1],[x14],x5         //store dst row 1
    sqshrun     v30.8b, v30.8h,#7
    st1         {v30.s}[0],[x14],x5         //store dst row 2
    subs        x9,x9,#4                    //4 columns done
    st1         {v30.s}[1],[x14],x5         //store dst row 3
    bgt         core_loop_4                 //more columns in this band of rows

end_core_loop_4:

    subs        x8,x8,#4                    //4 rows done; flags consumed by bgt below

    add         x0,x0,x7                    //src1: back to column 0, 4 rows down
    asr         x9,x6,#1                    //reload wd
    add         x1,x1,x10                   //src2: back to column 0, 4 rows down
    sub         x13,x9,x5,lsl #2
    neg         x14, x13                    //4*dst_strd - wd
    add         x2,x2,x14                   //dst: back to column 0, 4 rows down
    bgt         core_loop_4                 //more rows

    b           end_loops


// chroma: 4 columns x 2 rows per inner iteration
outer_loop_chroma_4x2:
    cbz         x9,end_loops                //wd == 0 : nothing to do
    sub         x13,x6,x3,lsl #1
    neg         x7, x13                     //x7  = 2 src1 rows (bytes) - 2*wd
    sub         x13,x6,x4,lsl #1
    neg         x10, x13                    //x10 = 2 src2 rows (bytes) - 2*wd
core_loop_chroma_4x2:
    add         x11,x0,x3                   //x11 = src1, next row
    add         x12,x1,x4                   //x12 = src2, next row
    ld1         {v6.4h},[x0],#8             //src1 row 0, advance 4 samples
    add         x14,x2,x5                   //x14 = dst, next row
    ld1         {v7.4h},[x1],#8             //src2 row 0, advance 4 samples
    ld1         {v1.4h},[x11],x3            //src1 row 1
    sqadd       v18.4h,v6.4h,v7.4h
    sqadd       v18.4h,v18.4h,v0.4h         //row 0 sum + constant term
    ld1         {v3.4h},[x12],x4            //src2 row 1
    sqadd       v20.4h,v1.4h,v3.4h
    sqadd       v19.4h,v20.4h,v0.4h         //row 1 sum + constant term
    mov         v18.d[1],v19.d[0]           //pack rows 0 and 1
    sqshrun     v20.8b, v18.8h,#7
    st1         {v20.s}[0],[x2],#4          //store dst row 0, advance 4 pixels
    st1         {v20.s}[1],[x14],x5         //store dst row 1

    subs        x9,x9,#4                    //4 columns done

    bgt         core_loop_chroma_4x2        //more columns in this pair of rows

end_core_loop_chroma_4x2:

    subs        x8,x8,#2                    //2 rows done; flags consumed by bgt below

    add         x0,x0,x7                    //src1: back to column 0, 2 rows down
    asr         x9,x6,#1                    //reload wd
    add         x1,x1,x10                   //src2: back to column 0, 2 rows down
    sub         x13,x9,x5,lsl #1
    neg         x14, x13                    //2*dst_strd - wd
    add         x2,x2,x14                   //dst: back to column 0, 2 rows down
    bgt         core_loop_chroma_4x2        //more rows

    b           end_loops



// 8 columns x 4 rows per inner iteration
outer_loop_8:
    cbz         x9,end_loops                //wd == 0 : nothing to do
    add         x11,x0,x3                   //x11 = src1, next row
    add         x12,x1,x4                   //x12 = src2, next row
core_loop_8:

    ld1         { v24.8h},[x0],#16          //src1 row 0, advance 8 samples
    add         x14,x2,x5                   //x14 = dst, next row
    ld1         { v26.8h},[x1],#16          //src2 row 0, advance 8 samples
    sqadd       v24.8h,v24.8h,v26.8h
    ld1         { v28.8h},[x11],x3          //src1 row 1
    sqadd       v24.8h,v24.8h,v0.8h         //row 0 sum + constant term
    ld1         { v30.8h},[x12],x4          //src2 row 1
    ld1         { v16.8h},[x11],x3          //src1 row 2
    sqadd       v22.8h,v28.8h,v30.8h
    ld1         { v18.8h},[x12],x4          //src2 row 2
    sqadd       v22.8h,v22.8h,v0.8h         //row 1 sum + constant term
    sqshrun     v20.8b, v24.8h,#7
    ld1         { v17.8h},[x11],x3          //src1 row 3
    sqadd       v30.8h,v16.8h,v18.8h
    sqshrun     v21.8b, v22.8h,#7
    ld1         { v29.8h},[x12],x4          //src2 row 3
    sqadd       v30.8h,v30.8h,v0.8h         //row 2 sum + constant term
    st1         {v20.2s},[x2],#8            //store dst row 0, advance 8 pixels
    sqadd       v1.8h,v17.8h,v29.8h
    st1         {v21.2s},[x14],x5           //store dst row 1
    sqadd       v1.8h,v1.8h,v0.8h           //row 3 sum + constant term
    sqshrun     v30.8b, v30.8h,#7
    sqshrun     v31.8b, v1.8h,#7
    add         x11,x0,x3                   //x11 = src1, next row (for next 8 columns)
    add         x12,x1,x4                   //x12 = src2, next row (for next 8 columns)
    st1         {v30.2s},[x14],x5           //store dst row 2
    subs        x9,x9,#8                    //8 columns done
    st1         {v31.2s},[x14],x5           //store dst row 3
    bgt         core_loop_8                 //more columns in this band of rows

end_core_loop_8:

    subs        x8,x8,#4                    //4 rows done; flags consumed by bgt below

    add         x0,x0,x7                    //src1: back to column 0, 4 rows down
    asr         x9,x6,#1                    //reload wd
    add         x1,x1,x10                   //src2: back to column 0, 4 rows down
    sub         x13,x9,x5,lsl #2
    neg         x14, x13                    //4*dst_strd - wd
    add         x2,x2,x14                   //dst: back to column 0, 4 rows down
    add         x11,x0,x3                   //x11 = src1, next row
    add         x12,x1,x4                   //x12 = src2, next row

    bgt         core_loop_8                 //more rows
    b           end_loops



// chroma: 8 columns x 2 rows per inner iteration
outer_loop_chroma_8x2:
    cbz         x9,end_loops                //wd == 0 : nothing to do
    add         x11,x0,x3                   //x11 = src1, next row
    add         x12,x1,x4                   //x12 = src2, next row
    sub         x13,x6,x3,lsl #1
    neg         x7, x13                     //x7  = 2 src1 rows (bytes) - 2*wd
    sub         x13,x6,x4,lsl #1
    neg         x10, x13                    //x10 = 2 src2 rows (bytes) - 2*wd
core_loop_chroma_8x2:

    ld1         { v24.8h},[x0],#16          //src1 row 0, advance 8 samples
    add         x14,x2,x5                   //x14 = dst, next row
    ld1         { v26.8h},[x1],#16          //src2 row 0, advance 8 samples
    sqadd       v24.8h,v24.8h,v26.8h
    ld1         { v28.8h},[x11],x3          //src1 row 1
    sqadd       v24.8h,v24.8h,v0.8h         //row 0 sum + constant term
    ld1         { v30.8h},[x12],x4          //src2 row 1
    // only two rows are produced here, so no third source row is loaded
    sqadd       v22.8h,v28.8h,v30.8h
    sqadd       v22.8h,v22.8h,v0.8h         //row 1 sum + constant term
    sqshrun     v20.8b, v24.8h,#7
    sqshrun     v21.8b, v22.8h,#7
    st1         {v20.2s},[x2],#8            //store dst row 0, advance 8 pixels
    st1         {v21.2s},[x14],x5           //store dst row 1

    add         x11,x0,x3                   //x11 = src1, next row (for next 8 columns)
    add         x12,x1,x4                   //x12 = src2, next row (for next 8 columns)

    subs        x9,x9,#8                    //8 columns done

    bgt         core_loop_chroma_8x2        //more columns in this pair of rows

end_core_loop_chroma_8x2:

    subs        x8,x8,#2                    //2 rows done; flags consumed by bgt below

    add         x0,x0,x7                    //src1: back to column 0, 2 rows down
    asr         x9,x6,#1                    //reload wd
    add         x1,x1,x10                   //src2: back to column 0, 2 rows down
    sub         x13,x9,x5,lsl #1
    neg         x14, x13                    //2*dst_strd - wd
    add         x2,x2,x14                   //dst: back to column 0, 2 rows down
    add         x11,x0,x3                   //x11 = src1, next row
    add         x12,x1,x4                   //x12 = src2, next row

    bgt         core_loop_chroma_8x2        //more rows

    b           end_loops




// 16 columns x 2 rows per step, software pipelined: the loads for block N+1
// are issued while block N is being rounded, narrowed and stored.
//
// Pointer walk per 16x2 block (each source uses its own stride):
//   row 0 cols 0-7 (+16) -> row 0 cols 8-15 (+stride-16) ->
//   row 1 cols 0-7 (+16) -> row 1 cols 8-15 (+16-stride) -> next block, row 0
// After the last block of a row pair an extra (2 rows - 2*wd) bytes is added.
outer_loop_16:
    cbz         x9,end_loops                //wd == 0 : nothing to do
    sub         x13,x6,x3,lsl #1
    neg         x7, x13                     //x7  = 2 src1 rows (bytes) - 2*wd
    sub         x13,x6,x4,lsl #1
    neg         x17, x13                    //x17 = 2 src2 rows (bytes) - 2*wd
    mov         x14,#16
    sub         x10,x14,x5                  //x10 = 16 - dst_strd  (dst row 1 -> row 0, next block)
    sub         x11,x3,x14                  //x11 = src1 row bytes - 16
    sub         x12,x14,x3                  //x12 = 16 - src1 row bytes
    sub         x15,x4,x14                  //x15 = src2 row bytes - 16
    sub         x16,x14,x4                  //x16 = 16 - src2 row bytes

    sub         x13,x9,x5,lsl #1
    neg         x14, x13                    //x14 = 2*dst_strd - wd



prolog_16:


    ld1         { v2.8h},[x0],#16           //src1 row 0, cols 0-7
    ld1         { v4.8h},[x1],#16           //src2 row 0, cols 0-7
    ld1         { v5.8h},[x0],x11           //src1 row 0, cols 8-15; then to row 1
    ld1         { v17.8h},[x1],x15          //src2 row 0, cols 8-15; then to row 1
    ld1         { v6.8h},[x0],#16           //src1 row 1, cols 0-7
    subs        x9,x9,#16                   //16 columns loaded; eq => row pair fully loaded
    ld1         { v1.8h},[x1],#16           //src2 row 1, cols 0-7
    sub         x13,x8,#2
    csel        x8, x13, x8,eq              //row pair fully loaded: ht -= 2
    sqadd       v22.8h,v2.8h,v4.8h
    ld1         { v29.8h},[x0],x12          //src1 row 1, cols 8-15; then to row 0, next block
    sqadd       v28.8h,v5.8h,v17.8h
    ld1         { v16.8h},[x1],x16          //src2 row 1, cols 8-15; then to row 0, next block
    add         x13,x0,x7
    csel        x0, x13, x0,eq              //row pair fully loaded: src1 to next row pair
    add         x13,x1,x17
    csel        x1, x13, x1,eq              //row pair fully loaded: src2 to next row pair
    sqadd       v24.8h,v6.8h,v1.8h
    sqadd       v26.8h,v29.8h,v16.8h
    // a single 16x2 block has nothing further to load
    cbz         x8,epilog_16

    ld1         { v2.8h},[x0],#16           //next block: src1 row 0, cols 0-7
    ld1         { v4.8h},[x1],#16           //next block: src2 row 0, cols 0-7
    sqadd       v22.8h,v22.8h,v0.8h
    ld1         { v5.8h},[x0],x11           //src1 row 0, cols 8-15; then to row 1
    sqadd       v28.8h,v28.8h,v0.8h
    ld1         { v17.8h},[x1],x15          //src2 row 0, cols 8-15; then to row 1
    sqadd       v24.8h,v24.8h,v0.8h
    ld1         { v6.8h},[x0],#16           //src1 row 1, cols 0-7
    sqadd       v30.8h,v26.8h,v0.8h
    sqshrun     v20.8b, v22.8h,#7
    ld1         { v1.8h},[x1],#16           //src2 row 1, cols 0-7
    sqshrun     v21.8b, v28.8h,#7
    ld1         { v29.8h},[x0],x12          //src1 row 1, cols 8-15; then to row 0, next block
    sqshrun     v26.8b, v24.8h,#7
    ld1         { v16.8h},[x1],x16          //src2 row 1, cols 8-15; then to row 0, next block
    sqshrun     v27.8b, v30.8h,#7



core_loop_16:

    cmp         x9,#0                       //eq => the block being stored ends a row pair
    sqadd       v22.8h,v2.8h,v4.8h
    asr         x13,x6,#1
    csel        x9,x13,x9,eq                //end of row pair: reload wd
    mov         v20.d[1],v21.d[0]           //dst row 0, 16 pixels
    mov         v26.d[1],v27.d[0]           //dst row 1, 16 pixels
    st1         { v20.4s},[x2],x5           //store row 0, step to row 1
    sqadd       v28.8h,v5.8h,v17.8h
    st1         { v26.4s},[x2],x10          //store row 1, step to row 0 of next block
    add         x13,x2,x14
    csel        x2, x13, x2,eq              //end of row pair: dst to next row pair
    sqadd       v24.8h,v6.8h,v1.8h
    subs        x9,x9,#16                   //eq => the block already loaded ends a row pair
    add         x13,x0,x7
    csel        x0, x13, x0,eq              //src1 to next row pair
    sqadd       v26.8h,v29.8h,v16.8h

    add         x13,x1,x17
    csel        x1, x13, x1,eq              //src2 to next row pair
    sub         x13,x8,#2
    csel        x8,x13,x8,eq                //ht -= 2
    cbz         x8,epilog_16                //all rows loaded: drain the pipeline


    sqadd       v22.8h,v22.8h,v0.8h
    ld1         { v2.8h},[x0],#16           //src1 row 0, cols 0-7
    sqadd       v28.8h,v28.8h,v0.8h
    ld1         { v4.8h},[x1],#16           //src2 row 0, cols 0-7
    sqadd       v24.8h,v24.8h,v0.8h
    ld1         { v5.8h},[x0],x11           //src1 row 0, cols 8-15; then to row 1
    sqadd       v30.8h,v26.8h,v0.8h
    ld1         { v17.8h},[x1],x15          //src2 row 0, cols 8-15; then to row 1
    sqshrun     v20.8b, v22.8h,#7
    ld1         { v6.8h},[x0],#16           //src1 row 1, cols 0-7
    sqshrun     v21.8b, v28.8h,#7
    ld1         { v1.8h},[x1],#16           //src2 row 1, cols 0-7
    sqshrun     v26.8b, v24.8h,#7
    ld1         { v29.8h},[x0],x12          //src1 row 1, cols 8-15; then to row 0, next block
    sqshrun     v27.8b, v30.8h,#7
    ld1         { v16.8h},[x1],x16          //src2 row 1, cols 8-15; then to row 0, next block


    b           core_loop_16


// drain: round, narrow and store the last 16x2 block
epilog_16:

    sqadd       v22.8h,v22.8h,v0.8h
    sqadd       v28.8h,v28.8h,v0.8h
    sqadd       v24.8h,v24.8h,v0.8h
    sqadd       v30.8h,v26.8h,v0.8h
    sqshrun     v20.8b, v22.8h,#7
    sqshrun     v21.8b, v28.8h,#7
    sqshrun     v26.8b, v24.8h,#7
    sqshrun     v27.8b, v30.8h,#7
    mov         v20.d[1],v21.d[0]
    mov         v26.d[1],v27.d[0]
    st1         { v20.4s},[x2],x5           //store row 0, step to row 1
    st1         { v26.4s},[x2]              //store row 1



end_core_loop_16:


end_loops:
    ret
538
539
540
541
542