///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_vert_neon_w16inp_w16out_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded in armv8 (aarch64) neon assembly, written in gnu
//*  assembler syntax
//*
//* //author
//*  yogeswaran rs / parthiban
//*
//* //par list of functions:
//*  - ihevc_inter_pred_chroma_vert_w16inp_w16out_av8()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*    chroma interprediction filter for 16bit vertical input and output.
//*
//* //par description:
//*    applies a 4-tap vertical filter with coefficients pointed to by
//*    'pi1_coeff' to the elements pointed to by 'pi2_src' and writes to the
//*    location pointed to by 'pi2_dst'.  input is 16 bits.  the filter output
//*    is downshifted by 6 (truncating, with signed saturation) and stored as a
//*    16 bit number; this routine does not subtract any offset from the
//*    result.  the output is used as an input to weighted prediction.
//*    assumptions : the function is optimized considering the fact that width
//*    and height are multiples of 2.
//*
//* //param[in] pi2_src
//*  word16 pointer to the source
//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride, in 16-bit elements (converted to bytes internally)
//*
//* //param[in] dst_strd
//*  integer destination stride, in 16-bit elements (converted to bytes
//*  internally)
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients (4 taps are used)
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array (each row processed holds 2*wd 16-bit samples)
//*
//* //returns
//*  none
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_vert_w16inp_w16out(word16 *pi2_src,
//                                                 word16 *pi2_dst,
//                                                 word32 src_strd,
//                                                 word32 dst_strd,
//                                                 word8 *pi1_coeff,
//                                                 word32 ht,
//                                                 word32 wd)
//**************variables vs registers*****************************************
//x0 => *pi2_src
//x1 => *pi2_dst
//x2 =>  src_strd
//x3 =>  dst_strd
//x4 => *pi1_coeff
//x5 =>  ht
//x6 =>  wd
// Code section for this file; everything below is emitted into .text.
.text
// On AArch64 GNU as, the .align operand is a power of two: 2^4 = 16 bytes.
.align 4

// Shared NEON helper macros used across the armv8 sources.
.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_vert_w16inp_w16out_av8

.type ihevc_inter_pred_chroma_vert_w16inp_w16out_av8, %function

//------------------------------------------------------------------------------
// void ihevc_inter_pred_chroma_vert_w16inp_w16out_av8(word16 *pi2_src,
//                                                     word16 *pi2_dst,
//                                                     word32 src_strd,
//                                                     word32 dst_strd,
//                                                     word8 *pi1_coeff,
//                                                     word32 ht,
//                                                     word32 wd)
//
// 4-tap vertical filter on 16-bit samples.  For every output sample:
//   dst[r][c] = sat16((c0*src[r-1][c] + c1*src[r][c] +
//                      c2*src[r+1][c] + c3*src[r+2][c]) >> 6)
// (sqshrn: truncating shift, signed saturation to 16 bits).
// Each row holds 2*wd samples; strides are given in 16-bit elements.
//
// ABI:   AAPCS64
// In:    x0 = pi2_src    x1 = pi2_dst    w2 = src_strd   w3 = dst_strd
//        x4 = pi1_coeff  w5 = ht         w6 = wd
// Out:   none (results written through pi2_dst)
// Clobb: x0-x12, x14, v0-v7, v16-v19, v24, v26, v28, v30, flags
//        (x19/x20 are saved and restored; x20 is the only one used)
// Stack: 16 bytes
//
// Two code paths:
//   core_loop_ht_2 : wd not a multiple of 4, or ht not a multiple of 4.
//                    Produces 2 rows x 4 samples per inner iteration.
//   core_loop_ht_4 : wd and ht both multiples of 4.  Software-pipelined
//                    (prolog / kernel_4 / epilog), 4 rows x 4 samples per
//                    iteration; loads for the next block overlap the
//                    arithmetic and stores of the current one.
//------------------------------------------------------------------------------
ihevc_inter_pred_chroma_vert_w16inp_w16out_av8:

    stp         x19, x20,[sp,#-16]!         //x20 is scratch below; saved as a pair so sp stays 16-byte aligned

    // The four word32 arguments arrive in w2/w3/w5/w6.  AAPCS64 leaves bits
    // [63:32] of those registers unspecified, so widen them before they are
    // used in 64-bit address and counter arithmetic.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    sxtw        x5,w5                       //ht
    sxtw        x6,w6                       //wd

    lsl         x2,x2,#1                    //src_strd in bytes (2 bytes per sample)
    ld1         {v0.8b},[x4]                //loads 8 bytes from pi1_coeff; only the first 4 are used
    sub         x4,x0,x2                    //x4 = pi2_src - one row (first tap reads row -1)
    sxtl        v0.8h, v0.8b                //sign-extend coefficients to 16 bits

    tst         x6,#3                       //wd & 3
    dup         v16.4h, v0.h[0]             //coeff_0
    dup         v17.4h, v0.h[1]             //coeff_1
    dup         v18.4h, v0.h[2]             //coeff_2
    dup         v19.4h, v0.h[3]             //coeff_3

    bne         core_loop_ht_2              //wd not a multiple of 4 -> 2-row loop

    tst         x5,#3                       //ht & 3
    beq         core_loop_ht_4              //wd and ht both multiples of 4 -> 4-row pipelined loop

core_loop_ht_2:
    lsl         x7,x2,#1                    //two source rows, in bytes
    lsl         x3,x3,#1                    //dst_strd in bytes
    lsl         x9,x6,#2                    //4*wd = bytes per row (2*wd samples)
    sub         x6,x3,x6,lsl #1             //dst row bytes - 2*wd; doubled when applied below
    sub         x8,x7,x9                    //two source rows - bytes consumed in a row
    mov         x12,x9                      //x12 = bytes left in the current row pair

inner_loop_ht_2:
    add         x0,x4,x2                    //x0 -> row 0 (x4 -> row -1)
    ld1         {v0.4h},[x4],#8             //row -1; x4 advances to the next 4 samples
    smull       v0.4s, v0.4h, v16.4h        //out0  = row(-1) * coeff_0
    subs        x12,x12,#8                  //8 bytes (4 samples) consumed; flags feed the bgt below
    ld1         {v2.4h},[x0],x2             //row 0
    smull       v7.4s, v2.4h, v16.4h        //out1  = row(0) * coeff_0
    ld1         {v3.4h},[x0],x2             //row 1
    smlal       v0.4s, v2.4h, v17.4h        //out0 += row(0) * coeff_1
    ld1         {v6.4h},[x0],x2             //row 2
    smlal       v7.4s, v3.4h, v17.4h        //out1 += row(1) * coeff_1
    ld1         {v2.4h},[x0]                //row 3
    add         x7,x1,x3                    //x7 -> second output row
    smlal       v0.4s, v3.4h, v18.4h        //out0 += row(1) * coeff_2
    smlal       v7.4s, v6.4h, v18.4h        //out1 += row(2) * coeff_2
    smlal       v0.4s, v6.4h, v19.4h        //out0 += row(2) * coeff_3
    smlal       v7.4s, v2.4h, v19.4h        //out1 += row(3) * coeff_3
    sqshrn      v0.4h, v0.4s,#6             //>> 6, saturate to 16 bits
    sqshrn      v30.4h, v7.4s,#6            //>> 6, saturate to 16 bits
    st1         {v0.2s},[x1],#8             //store 4 results of output row 0
    st1         {v30.2s},[x7]               //store 4 results of output row 1
    bgt         inner_loop_ht_2             //more columns left in this row pair

    //inner loop ends
    subs        x5,x5,#2                    //two rows done; flags feed the bgt below
    add         x1,x1,x6,lsl #1             //pi2_dst -> start of the next row pair
    mov         x12,x9                      //reload bytes-per-row counter
    add         x4,x4,x8                    //source -> start of the next row pair
    bgt         inner_loop_ht_2             //more rows left

    b           end_loops                   //jumps to end

core_loop_ht_4:
    lsl         x7,x2,#2                    //four source rows, in bytes
    lsl         x10,x3,#2                   //4*dst_strd = two destination rows, in bytes
    lsr         x11, x6, #1                 //wd / 2
    sub         x14,x10,x6,lsl #1           //4*dst_strd - 2*wd; doubled when applied below
    sub         x8,x7,x6,lsl #2             //four source rows - 4*wd bytes consumed in a row

    mul         x12, x5 , x11               //ht * wd/2 = 4 * number of 4x4 blocks
    sub         x12, x12,#4                 //one block is covered by the prolog/epilog pair
    lsl         x11, x6, #1                 //x11 = 2*wd samples left in the current 4-row group
    lsl         x3,x3,#1                    //dst_strd in bytes

prolog:
    add         x0,x4,x2                    //x0 -> row 0 (x4 -> row -1)
    ld1         {v0.4h},[x4],#8             //row -1; x4 advances to the next 4 samples
    ld1         {v1.4h},[x0],x2             //row 0
    subs        x11,x11,#4                  //4 samples consumed; 'le' = 4-row group finished (used by the csel's below)
    ld1         {v2.4h},[x0],x2             //row 1
    smull       v30.4s, v0.4h, v16.4h       //out0  = row(-1) * coeff_0
    ld1         {v3.4h},[x0],x2             //row 2
    smlal       v30.4s, v1.4h, v17.4h       //out0 += row(0) * coeff_1
    smlal       v30.4s, v2.4h, v18.4h       //out0 += row(1) * coeff_2
    add         x9,x1,x3                    //x9 -> output row 1
    smlal       v30.4s, v3.4h, v19.4h       //out0 += row(2) * coeff_3

    ld1         {v4.4h},[x0],x2             //row 3
    smull       v28.4s, v1.4h, v16.4h       //out1  = row(0) * coeff_0
    add         x20,x4,x8
    csel        x4, x20, x4,le              //group finished -> source moves to the next 4-row group
    lsl         x20,x6,#1
    csel        x11, x20, x11,le            //group finished -> reload column counter with 2*wd
    smlal       v28.4s, v2.4h, v17.4h       //out1 += row(1) * coeff_1
    smlal       v28.4s, v3.4h, v18.4h       //out1 += row(2) * coeff_2
    ld1         {v5.4h},[x0],x2             //row 4
    smlal       v28.4s, v4.4h, v19.4h       //out1 += row(3) * coeff_3

    sqshrn      v30.4h, v30.4s,#6           //out0 >> 6, saturate to 16 bits

    ld1         {v6.4h},[x0],x2             //row 5
    smull       v26.4s, v2.4h, v16.4h       //out2  = row(1) * coeff_0
    smlal       v26.4s, v3.4h, v17.4h       //out2 += row(2) * coeff_1
    smlal       v26.4s, v4.4h, v18.4h       //out2 += row(3) * coeff_2
    add         x0,x4,x2                    //x0 -> row 0 of the next block
    ld1         {v0.4h},[x4],#8             //next block: row -1
    smlal       v26.4s, v5.4h, v19.4h       //out2 += row(4) * coeff_3

    sqshrn      v28.4h, v28.4s,#6           //out1 >> 6, saturate to 16 bits

    ld1         {v1.4h},[x0],x2             //next block: row 0
    smull       v24.4s, v3.4h, v16.4h       //out3  = row(2) * coeff_0 (v3 is reloaded only after this)
    st1         {v30.2s},[x1],#8            //store output row 0; x1 advances to the next 4 samples
    smlal       v24.4s, v4.4h, v17.4h       //out3 += row(3) * coeff_1
    ld1         {v2.4h},[x0],x2             //next block: row 1
    smlal       v24.4s, v5.4h, v18.4h       //out3 += row(4) * coeff_2
    ld1         {v3.4h},[x0],x2             //next block: row 2
    smlal       v24.4s, v6.4h, v19.4h       //out3 += row(5) * coeff_3
    add         x20,x1,x14,lsl #1
    csel        x1, x20, x1,le              //group finished -> destination moves to the next 4-row group

    sqshrn      v26.4h, v26.4s,#6           //out2 >> 6, saturate to 16 bits
    subs        x12,x12,#4                  //one block issued

    beq         epilog                      //only the final block is left

kernel_4:
    smull       v30.4s, v0.4h, v16.4h       //out0  = row(-1) * coeff_0
    subs        x11,x11,#4                  //4 samples consumed; 'le' = 4-row group finished
    smlal       v30.4s, v1.4h, v17.4h       //out0 += row(0) * coeff_1
    st1         {v28.2s},[x9],x3            //previous block: store output row 1
    smlal       v30.4s, v2.4h, v18.4h       //out0 += row(1) * coeff_2
    smlal       v30.4s, v3.4h, v19.4h       //out0 += row(2) * coeff_3

    sqshrn      v24.4h, v24.4s,#6           //previous block: out3 >> 6, saturate to 16 bits

    ld1         {v4.4h},[x0],x2             //row 3
    smull       v28.4s, v1.4h, v16.4h       //out1  = row(0) * coeff_0
    smlal       v28.4s, v2.4h, v17.4h       //out1 += row(1) * coeff_1
    smlal       v28.4s, v3.4h, v18.4h       //out1 += row(2) * coeff_2
    smlal       v28.4s, v4.4h, v19.4h       //out1 += row(3) * coeff_3
    st1         {v26.2s},[x9],x3            //previous block: store output row 2
    add         x20,x4,x8
    csel        x4, x20, x4,le              //group finished -> source moves to the next 4-row group
    lsl         x20,x6,#1
    csel        x11, x20, x11,le            //group finished -> reload column counter with 2*wd

    sqshrn      v30.4h, v30.4s,#6           //out0 >> 6, saturate to 16 bits

    ld1         {v5.4h},[x0],x2             //row 4
    smull       v26.4s, v2.4h, v16.4h       //out2  = row(1) * coeff_0
    ld1         {v6.4h},[x0],x2             //row 5
    smlal       v26.4s, v3.4h, v17.4h       //out2 += row(2) * coeff_1
    st1         {v24.2s},[x9]               //previous block: store output row 3
    add         x0,x4,x2                    //x0 -> row 0 of the next block
    smlal       v26.4s, v4.4h, v18.4h       //out2 += row(3) * coeff_2
    ld1         {v0.4h},[x4],#8             //next block: row -1
    smlal       v26.4s, v5.4h, v19.4h       //out2 += row(4) * coeff_3

    sqshrn      v28.4h, v28.4s,#6           //out1 >> 6, saturate to 16 bits

    ld1         {v1.4h},[x0],x2             //next block: row 0
    smull       v24.4s, v3.4h, v16.4h       //out3  = row(2) * coeff_0 (v3 is reloaded only after this)
    ld1         {v2.4h},[x0],x2             //next block: row 1
    smlal       v24.4s, v4.4h, v17.4h       //out3 += row(3) * coeff_1
    add         x9,x1,x3                    //x9 -> output row 1 of the current block
    ld1         {v3.4h},[x0],x2             //next block: row 2
    smlal       v24.4s, v5.4h, v18.4h       //out3 += row(4) * coeff_2

    st1         {v30.2s},[x1],#8            //store output row 0; x1 advances to the next 4 samples
    smlal       v24.4s, v6.4h, v19.4h       //out3 += row(5) * coeff_3

    sqshrn      v26.4h, v26.4s,#6           //out2 >> 6, saturate to 16 bits
    add         x20,x1,x14,lsl #1
    csel        x1, x20, x1,le              //group finished -> destination moves to the next 4-row group

    subs        x12,x12,#4                  //one block issued

    bgt         kernel_4                    //more blocks before the final one

epilog:
    smull       v30.4s, v0.4h, v16.4h       //out0  = row(-1) * coeff_0
    st1         {v28.2s},[x9],x3            //previous block: store output row 1
    smlal       v30.4s, v1.4h, v17.4h       //out0 += row(0) * coeff_1
    smlal       v30.4s, v2.4h, v18.4h       //out0 += row(1) * coeff_2
    smlal       v30.4s, v3.4h, v19.4h       //out0 += row(2) * coeff_3

    sqshrn      v24.4h, v24.4s,#6           //previous block: out3 >> 6, saturate to 16 bits

    smull       v28.4s, v1.4h, v16.4h       //out1  = row(0) * coeff_0
    ld1         {v4.4h},[x0],x2             //row 3
    smlal       v28.4s, v2.4h, v17.4h       //out1 += row(1) * coeff_1
    st1         {v26.2s},[x9],x3            //previous block: store output row 2
    smlal       v28.4s, v3.4h, v18.4h       //out1 += row(2) * coeff_2
    smlal       v28.4s, v4.4h, v19.4h       //out1 += row(3) * coeff_3

    sqshrn      v30.4h, v30.4s,#6           //out0 >> 6, saturate to 16 bits

    smull       v26.4s, v2.4h, v16.4h       //out2  = row(1) * coeff_0
    ld1         {v5.4h},[x0],x2             //row 4
    smlal       v26.4s, v3.4h, v17.4h       //out2 += row(2) * coeff_1
    smlal       v26.4s, v4.4h, v18.4h       //out2 += row(3) * coeff_2
    smlal       v26.4s, v5.4h, v19.4h       //out2 += row(4) * coeff_3

    sqshrn      v28.4h, v28.4s,#6           //out1 >> 6, saturate to 16 bits

    st1         {v24.2s},[x9]               //previous block: store output row 3
    smull       v24.4s, v3.4h, v16.4h       //out3  = row(2) * coeff_0
    smlal       v24.4s, v4.4h, v17.4h       //out3 += row(3) * coeff_1
    add         x9,x1,x3                    //x9 -> output row 1 of the final block
    ld1         {v6.4h},[x0],x2             //row 5
    smlal       v24.4s, v5.4h, v18.4h       //out3 += row(4) * coeff_2
    smlal       v24.4s, v6.4h, v19.4h       //out3 += row(5) * coeff_3
    st1         {v30.2s},[x1],#8            //final block: store output row 0

    sqshrn      v26.4h, v26.4s,#6           //out2 >> 6, saturate to 16 bits

    st1         {v28.2s},[x9],x3            //final block: store output row 1

    sqshrn      v24.4h, v24.4s,#6           //out3 >> 6, saturate to 16 bits
    st1         {v26.2s},[x9],x3            //final block: store output row 2

    st1         {v24.2s},[x9]               //final block: store output row 3

end_loops:
    ldp         x19, x20,[sp],#16           //restore callee-saved pair saved at entry

    ret
