1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* //file
21//*  ihevc_weighted_pred_bi.s
22//*
23//* //brief
24//*  contains function definitions for weighted prediction used in inter
25//* prediction
26//*
27//* //author
28//*  parthiban v
29//*
30//* //par list of functions:
31//*  - ihevc_weighted_pred_bi()
32//*
33//* //remarks
34//*  none
35//*
36//*******************************************************************************
37//*/
38///**
39//*******************************************************************************
40//*
41//* //brief
//*  does bi-weighted prediction on the arrays pointed to by pi2_src1 and
//* pi2_src2 and stores the result at the location pointed to by pu1_dst.
//* assumptions : the function is optimized considering the fact that width
//* and height are multiples of 2; note that the loops in this file step
//* through the block 4 columns and 4 rows at a time.
46//*
47//* //par description:
//*  dst = clip_u8( ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
//* ((off0 + off1 + 1) << (shift - 1)) ) >> shift )
50//*
51//* //param[in] pi2_src1
52//*  pointer to source 1
53//*
54//* //param[in] pi2_src2
55//*  pointer to source 2
56//*
57//* //param[out] pu1_dst
58//*  pointer to destination
59//*
60//* //param[in] src_strd1
61//*  source stride 1
62//*
63//* //param[in] src_strd2
64//*  source stride 2
65//*
66//* //param[in] dst_strd
67//*  destination stride
68//*
69//* //param[in] wgt0
70//*  weight to be multiplied to source 1
71//*
72//* //param[in] off0
73//*  offset 0
74//*
75//* //param[in] wgt1
76//*  weight to be multiplied to source 2
77//*
78//* //param[in] off1
79//*  offset 1
80//*
81//* //param[in] shift
82//*  (14 bit depth) + log2_weight_denominator
83//*
84//* //param[in] lvl_shift1
85//*  added before shift and offset
86//*
87//* //param[in] lvl_shift2
88//*  added before shift and offset
89//*
90//* //param[in] ht
91//*  height of the source
92//*
93//* //param[in] wd
94//*  width of the source
95//*
96//* //returns
97//*
98//* //remarks
99//*  none
100//*
101//*******************************************************************************
102//*/
103
104//void ihevc_weighted_pred_bi(word16 *pi2_src1,
105//                            word16 *pi2_src2,
106//                            uword8 *pu1_dst,
107//                            word32 src_strd1,
108//                            word32 src_strd2,
109//                            word32 dst_strd,
110//                            word32 wgt0,
111//                            word32 off0,
112//                            word32 wgt1,
113//                            word32 off1,
114//                            word32 shift,
115//                            word32 lvl_shift1,
116//                            word32 lvl_shift2,
117//                            word32 ht,
118//                            word32 wd)
119
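//**************variables vs registers*****************************************
// register arguments (as passed by the caller):
//    x0 => *pi2_src1
//    x1 => *pi2_src2
//    x2 => *pu1_dst
//    x3 =>  src_strd1
//    x4 =>  src_strd2
//    x5 =>  dst_strd
//    x6 =>  wgt0
//    x7 =>  off0
// stack arguments (read from the caller's stack at function entry;
// ht and wd are stack arguments as well):
//    x8 =>  wgt1
//    x9 =>  off1
//    x10 =>  shift
//    x11 =>  lvl_shift1
//    x12 =>  lvl_shift2
// loop counters (inside the processing loops):
//    x14 =>  ht
//    x7 =>  wd (x7 is reused for wd once off0 has been consumed)
// several of the registers above are reused as temporaries later on.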
136
.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_weighted_pred_bi_av8

.type ihevc_weighted_pred_bi_av8, %function

//-----------------------------------------------------------------------------
// ihevc_weighted_pred_bi_av8
//
// Bi-directional weighted prediction: combines two blocks of signed 16-bit
// samples into one block of unsigned 8-bit samples.
//
// Per sample:
//   acc = src1*wgt0 + src2*wgt1 + tmp_lvl_shift
//   dst = saturate_u8( acc >> shift )          (arithmetic shift, no rounding
//                                               other than the offset term)
//   tmp_lvl_shift = lvl_shift1*wgt0 + lvl_shift2*wgt1
//                 + ((off0 + off1 + 1) << (shift - 1))
//
// In (registers):
//   x0 = pi2_src1, x1 = pi2_src2, x2 = pu1_dst
//   w3 = src_strd1, w4 = src_strd2   (in 16-bit elements)
//   w5 = dst_strd                    (in bytes)
//   w6 = wgt0, w7 = off0
// In (caller's stack, read at 8-byte spacing):
//   [sp,#0]  wgt1        [sp,#8]  off1       [sp,#16] shift
//   [sp,#24] lvl_shift1  [sp,#32] lvl_shift2
//   [sp,#40] ht          [sp,#48] wd
//
// Notes:
//   - Only the low 16 bits of wgt0/wgt1 take part in the multiplies.
//   - tmp_lvl_shift is truncated to 32 bits before it is broadcast.
//   - The block is processed in 4x4 tiles: wd and ht are consumed in steps
//     of 4, so both are expected to be multiples of 4.
//   - Returns immediately when ht <= 0 or wd <= 0.
//
// Clobbers: x0-x15, v0-v7, v16-v20, v28, v30, condition flags.
// No callee-saved register is touched, so no stack frame is created.
//-----------------------------------------------------------------------------
ihevc_weighted_pred_bi_av8:

    // Stack arguments are 32-bit values; load them sign-extended to 64 bits.
    ldrsw       x8,[sp,#0]                  //wgt1
    ldrsw       x9,[sp,#8]                  //off1
    ldrsw       x10,[sp,#16]                //shift
    ldrsw       x11,[sp,#24]                //lvl_shift1
    ldrsw       x12,[sp,#32]                //lvl_shift2
    ldrsw       x14,[sp,#40]                //ht
    ldrsw       x13,[sp,#48]                //wd

    // The upper 32 bits of a register holding a 32-bit argument are
    // unspecified on entry, so widen before any 64-bit arithmetic or
    // address computation uses them.
    sxtw        x3,w3                       //src_strd1
    sxtw        x4,w4                       //src_strd2
    sxtw        x5,w5                       //dst_strd
    sxtw        x6,w6                       //wgt0
    sxtw        x7,w7                       //off0

    cmp         x14,#0                      //nothing to do when ht <= 0
    ble         end_loops
    cmp         x13,#0                      //nothing to do when wd <= 0
    ble         end_loops

    mov         v7.h[0],w6                  //wgt0 -> lane 0 for by-element multiply
    mov         v7.h[1],w8                  //wgt1 -> lane 1 for by-element multiply

    mul         x15,x11,x6                  //lvl_shift1 * wgt0
    madd        x15,x12,x8,x15              //+ lvl_shift2 * wgt1
    add         x11,x7,x9                   //off0 + off1
    add         x11,x11,#1                  //off0 + off1 + 1
    sub         x12,x10,#1                  //shift - 1
    lsl         x11,x11,x12                 //(off0 + off1 + 1) << (shift - 1)
    add         x15,x15,x11                 //tmp_lvl_shift

    dup         v30.4s,w15                  //vmovq_n_s32(tmp_lvl_shift)
    dup         v28.4s,w10                  //vmovq_n_s32(shift)
    neg         v28.4s, v28.4s              //-shift: sshl by a negative count shifts right

    mov         x7,x13                      //x7 = wd, column counter
    lsl         x9,x13,#1                   //x9 = 2*wd = source bytes consumed per row
    lsl         x3,x3,#1                    //src_strd1 in bytes (16-bit samples)
    lsl         x4,x4,#1                    //src_strd2 in bytes (16-bit samples)

    // Loop-invariant pointer advances applied after each group of 4 rows:
    // move down 4 rows and back to the first column.
    lsl         x11,x3,#2
    sub         x11,x11,x9                  //4*(2*src_strd1) - 2*wd
    lsl         x12,x4,#2
    sub         x12,x12,x9                  //4*(2*src_strd2) - 2*wd
    lsl         x13,x5,#2
    sub         x13,x13,x7                  //4*dst_strd - wd

    // Each pass handles a tile of 4 columns x 4 rows (rows i, ii, iii, iv).
    // The instruction order interleaves loads, multiplies and stores of the
    // four rows and is kept exactly as scheduled.
core_loop:
    add         x6,x0,x3                    //pi2_src_tmp1 = pi2_src1 + 2*src_strd1 (bytes, row ii)
    add         x8,x1,x4                    //pi2_src_tmp2 = pi2_src2 + 2*src_strd2 (bytes, row ii)
    ld1         {v0.4h},[x0],#8             //load row i of src1, advance pi2_src1 by 4 samples
    add         x10,x2,x5                   //pu1_dst_tmp = pu1_dst + dst_strd (row ii)
    ld1         {v1.4h},[x1],#8             //load row i of src2, advance pi2_src2 by 4 samples
    smull       v4.4s, v0.4h, v7.h[0]       //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
    ld1         {v2.4h},[x6],x3             //load row ii of src1, step pi2_src_tmp1 to row iii
    smull       v5.4s, v1.4h, v7.h[1]       //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
    ld1         {v3.4h},[x8],x4             //load row ii of src2, step pi2_src_tmp2 to row iii
    add         v4.4s,  v4.4s ,  v5.4s      //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)

    ld1         {v0.4h},[x6],x3             //load row iii of src1, step pi2_src_tmp1 to row iv
    smull       v6.4s, v2.4h, v7.h[0]       //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration

    ld1         {v1.4h},[x8],x4             //load row iii of src2, step pi2_src_tmp2 to row iv
    add         v4.4s,  v4.4s ,  v30.4s     //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    smull       v19.4s, v0.4h, v7.h[0]      //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration

    ld1         {v2.4h},[x6],x3             //load row iv of src1
    smull       v17.4s, v3.4h, v7.h[1]      //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
    sshl        v4.4s,v4.4s,v28.4s          //vshlq_s32(i4_tmp1_t1, tmp_shift_t)

    ld1         {v3.4h},[x8],x4             //load row iv of src2
    add         v6.4s,  v6.4s ,  v17.4s     //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration

    sqxtun      v4.4h, v4.4s                //vqmovun_s32: saturate to u16, upper half of v4 cleared
    smull       v16.4s, v1.4h, v7.h[1]      //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration

    add         v6.4s,  v6.4s ,  v30.4s     //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
    add         v19.4s,  v19.4s ,  v16.4s   //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration

    sshl        v6.4s,v6.4s,v28.4s          //vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
    smull       v18.4s, v2.4h, v7.h[0]      //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
    uqxtn       v4.8b,v4.8h                 //vqmovn_u16: saturate to u8, result in low 4 bytes
    add         v19.4s,  v19.4s ,  v30.4s   //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration

    sqxtun      v6.4h, v6.4s                //vqmovun_s32(sto_res_tmp1) ii iteration
    smull       v20.4s, v3.4h, v7.h[1]      //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration

    sshl        v19.4s,v19.4s,v28.4s        //vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration

    add         v18.4s,  v18.4s ,  v20.4s   //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    sqxtun      v19.4h, v19.4s              //vqmovun_s32(sto_res_tmp1) iii iteration

    add         v18.4s,  v18.4s ,  v30.4s   //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
    st1         {v4.s}[0],[x2],#4           //store 4 bytes of row i, advance pu1_dst by 4

    uqxtn       v6.8b,v6.8h                 //vqmovn_u16(sto_res_tmp3) ii iteration
    sshl        v18.4s,v18.4s,v28.4s        //vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
    st1         {v6.s}[0],[x10],x5          //store 4 bytes of row ii, step pu1_dst_tmp to row iii

    uqxtn       v19.8b,v19.8h               //vqmovn_u16(sto_res_tmp3) iii iteration
    sqxtun      v18.4h, v18.4s              //vqmovun_s32(sto_res_tmp1) iv iteration
    st1         {v19.s}[0],[x10],x5         //store 4 bytes of row iii, step pu1_dst_tmp to row iv
    uqxtn       v18.8b,v18.8h               //vqmovn_u16(sto_res_tmp3) iv iteration
    subs        x7,x7,#4                    //4 columns done; sets flags for the branch below
    st1         {v18.s}[0],[x10],x5         //store 4 bytes of row iv (st1 leaves the flags intact)

    bgt         core_loop                   //more columns left in this group of 4 rows

end_core_loop:
    subs        x14,x14,#4                  //4 rows done; the flags survive the adds/asr below
    asr         x7,x9,#1                    //reload the column counter with wd
    add         x0,x0,x11                   //pi2_src1 += 4 rows - consumed row bytes
    add         x1,x1,x12                   //pi2_src2 += 4 rows - consumed row bytes
    add         x2,x2,x13                   //pu1_dst  += 4*dst_strd - wd
    bgt         core_loop                   //more rows left: process the next group of 4 rows

end_loops:
    ret
311
312
313
314
315
316
317