1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_weighted_pred_uni.s
22//*
23//* @brief
24//*  contains function definitions for weighted prediction used in inter
25//* prediction
26//*
27//* @author
28//*  parthiban v
29//*
30//* @par list of functions:
31//*  - ihevc_weighted_pred_uni()
32//*
33//* @remarks
34//*  none
35//*
36//*******************************************************************************
37//*/
38
39///**
40//*******************************************************************************
41//*
42//* @brief
//*  does uni-weighted prediction on the array pointed to by pi2_src and
//* stores the result at the location pointed to by pu1_dst. assumptions : the
//* function works on 4x4 blocks (4 columns of 4 rows per iteration), so width
//* and height are expected to be multiples of 4.
//*
//* @par description:
//*  dst = clip_u8( ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift )
//*                 + off0 )
50//*
51//* @param[in] pi2_src
52//*  pointer to the source
53//*
54//* @param[out] pu1_dst
55//*  pointer to the destination
56//*
57//* @param[in] src_strd
58//*  source stride
59//*
60//* @param[in] dst_strd
61//*  destination stride
62//*
63//* @param[in] wgt0
64//*  weight to be multiplied to the source
65//*
//* @param[in] off0
//*  offset to be added after rounding and shifting
//*
72//* @param[in] shift
73//*  (14 bit depth) + log2_weight_denominator
74//*
75//* @param[in] lvl_shift
76//*  added before shift and offset
77//*
78//* @param[in] ht
79//*  height of the source
80//*
81//* @param[in] wd
82//*  width of the source
83//*
84//* @returns
85//*
86//* @remarks
87//*  none
88//*
89//*******************************************************************************
90//*/
91
92//void ihevc_weighted_pred_uni(word16 *pi2_src,
93//                             uword8 *pu1_dst,
94//                             word32 src_strd,
95//                             word32 dst_strd,
96//                             word32 wgt0,
97//                             word32 off0,
98//                             word32 shift,
99//                             word32 lvl_shift,
100//                             word32 ht,
101//                             word32 wd)
102
103//**************variables vs registers*****************************************
104//    x0 => *pi2_src
105//    x1 => *pu1_dst
106//    x2 =>  src_strd
107//    x3 =>  dst_strd
108//    x4 =>  wgt0
109//    x5 =>  off0
110//    x6 =>  shift
111//    x7 =>  lvl_shift
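//    [sp, #0] =>  ht        (9th argument, passed on the stack, loaded into x8)
//    [sp, #8] =>  wd        (10th argument, passed on the stack, loaded into x9)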
114
115.text
116.align 4
117
118.include "ihevc_neon_macros.s"
119
120.globl ihevc_weighted_pred_uni_av8
121
122.type ihevc_weighted_pred_uni_av8, %function
123
//-----------------------------------------------------------------------------
// void ihevc_weighted_pred_uni_av8(word16 *pi2_src, uword8 *pu1_dst,
//                                  word32 src_strd, word32 dst_strd,
//                                  word32 wgt0, word32 off0,
//                                  word32 shift, word32 lvl_shift,
//                                  word32 ht, word32 wd)
//
// Per output sample:
//   dst = sat_u8((src * wgt0 + bias) >> shift)
//   bias = lvl_shift * wgt0 + (off0 << shift) + (1 << (shift - 1))
//
// In   : x0 = pi2_src, x1 = pu1_dst, w2 = src_strd (in 16-bit elements),
//        w3 = dst_strd (in bytes), w4 = wgt0, w5 = off0, w6 = shift,
//        w7 = lvl_shift, [sp,#0] = ht, [sp,#8] = wd
// Work : one 4x4 tile per core_loop iteration (4 columns of 4 rows), so the
//        tile loops step wd and ht by 4.
// Uses : x0-x13, v0-v7, v16, v28, v30 and the flags. No callee-saved register
//        is touched, so nothing is spilled to the stack.
// Notes: the 32-bit strides are sign-extended before being used in address
//        arithmetic, because the upper halves of w2/w3 are not guaranteed by
//        the procedure call standard. ht <= 0 or wd <= 0 returns immediately.
//-----------------------------------------------------------------------------
ihevc_weighted_pred_uni_av8:

    ldrsw       x8,[sp,#0]                  //ht (9th argument, on the stack)
    ldrsw       x9,[sp,#8]                  //wd (10th argument, on the stack)

    sxtw        x2,w2                       //src_strd: widen word32 to 64 bit
    sxtw        x3,w3                       //dst_strd: widen word32 to 64 bit

    cmp         x8,#0                       //nothing to do when ht <= 0
    ble         end_loops
    cmp         x9,#0                       //nothing to do when wd <= 0
    ble         end_loops

    //bias = lvl_shift * wgt0 + (off0 << shift) + (1 << (shift - 1))
    //only the low 32 bits of x10 are consumed (dup from w10 below), so stale
    //upper halves in x4..x7 cannot influence the result
    mul         x10,x7,x4                   //lvl_shift * wgt0
    lsl         x11,x5,x6                   //off0 << shift
    add         x10,x10,x11
    sub         x12,x6,#1                   //shift - 1
    mov         x11,#1
    lsl         x11,x11,x12                 //rounding term 1 << (shift - 1)
    add         x10,x10,x11

    mov         v0.h[0],w4                  //wgt0 as the 16 bit scalar multiplier
    dup         v28.4s,w6                   //shift in every lane
    dup         v30.4s,w10                  //bias in every lane
    neg         v28.4s,v28.4s               //sshl by -shift is an arithmetic right shift

    lsl         x2,x2,#1                    //source stride in bytes (16 bit samples)

    //loop invariant pointer rewinds applied after each band of 4 rows
    lsl         x11,x2,#2                   //4 source rows in bytes
    sub         x11,x11,x9,lsl #1           //  minus the 2*wd bytes already consumed
    lsl         x12,x3,#2                   //4 destination rows in bytes
    sub         x12,x12,x9                  //  minus the wd bytes already written
    mov         x13,x9                      //keep wd to re-arm the column counter

outer_loop:
    mov         x9,x13                      //columns remaining in this band of rows

core_loop:
    add         x5,x0,x2                    //source pointer for row 1
    add         x6,x1,x3                    //destination pointer for row 1
    ld1         {v1.4h},[x0],#8             //row 0: 4 samples, advance to the next tile
    ld1         {v2.4h},[x5],x2             //row 1: 4 samples, step down one row
    smull       v4.4s, v1.4h, v0.h[0]       //row 0: src * wgt0 (32 bit products)

    add         v4.4s,  v4.4s ,  v30.4s     //row 0: + bias
    ld1         {v3.4h},[x5],x2             //row 2: 4 samples

    smull       v6.4s, v2.4h, v0.h[0]       //row 1: src * wgt0
    ld1         {v5.4h},[x5],x2             //row 3: 4 samples

    sshl        v4.4s,v4.4s,v28.4s          //row 0: >> shift
    add         v6.4s,  v6.4s ,  v30.4s     //row 1: + bias

    smull       v7.4s, v3.4h, v0.h[0]       //row 2: src * wgt0
    sqxtun      v4.4h, v4.4s                //row 0: saturate s32 -> u16

    add         v7.4s,  v7.4s ,  v30.4s     //row 2: + bias

    sshl        v6.4s,v6.4s,v28.4s          //row 1: >> shift

    smull       v16.4s, v5.4h, v0.h[0]      //row 3: src * wgt0
    uqxtn       v4.8b,  v4.8h               //row 0: saturate u16 -> u8

    sshl        v7.4s,v7.4s,v28.4s          //row 2: >> shift
    sqxtun      v6.4h, v6.4s                //row 1: saturate s32 -> u16

    add         v16.4s,  v16.4s ,  v30.4s   //row 3: + bias

    sqxtun      v7.4h, v7.4s                //row 2: saturate s32 -> u16

    sshl        v16.4s,v16.4s,v28.4s        //row 3: >> shift
    st1         {v4.s}[0],[x1],#4           //row 0: store 4 bytes, advance to the next tile

    uqxtn       v6.8b,  v6.8h               //row 1: saturate u16 -> u8
    st1         {v6.s}[0],[x6],x3           //row 1: store 4 bytes, step down one row

    uqxtn       v7.8b,  v7.8h               //row 2: saturate u16 -> u8
    sqxtun      v16.4h, v16.4s              //row 3: saturate s32 -> u16

    st1         {v7.s}[0],[x6],x3           //row 2: store 4 bytes
    uqxtn       v16.8b,  v16.8h             //row 3: saturate u16 -> u8

    subs        x9,x9,#4                    //4 columns done (st1 below leaves the flags alone)
    st1         {v16.s}[0],[x6],x3          //row 3: store 4 bytes
    bgt         core_loop                   //more columns left in this band

end_core_loop:
    add         x0,x0,x11                   //pi2_src: start of the next band of 4 rows
    add         x1,x1,x12                   //pu1_dst: start of the next band of 4 rows
    subs        x8,x8,#4                    //4 rows done
    bgt         outer_loop                  //more rows left

end_loops:
    ret
244
245
246