1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* //file
21//*  ihevc_inter_pred_chroma_vert_neon.s
22//*
23//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded using neon assembly and can be compiled using
//*  rvct
28//*
29//* //author
30//*  yogeswaran rs
31//*
32//* //par list of functions:
33//*
34//*
35//* //remarks
36//*  none
37//*
38//*******************************************************************************
39//*/
40///**
41///**
42//*******************************************************************************
43//*
44//* //brief
45//*   chroma interprediction filter for vertical input
46//*
47//* //par description:
//*    applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
//*    the elements pointed to by 'pu1_src' and writes to the location pointed
//*    to by 'pu1_dst'. the output is down shifted by 6 (with rounding) and
//*    clipped to 8 bits.
//*    assumptions : the function is optimized considering the fact that width
//*    is a multiple of 2, 4 or 8, and that height is a multiple of 2.
//*    width 4, 8 is optimized further
54//*
55//* //param[in] pu1_src
56//*  uword8 pointer to the source
57//*
58//* //param[out] pu1_dst
59//*  uword8 pointer to the destination
60//*
61//* //param[in] src_strd
62//*  integer source stride
63//*
64//* //param[in] dst_strd
65//*  integer destination stride
66//*
67//* //param[in] pi1_coeff
68//*  word8 pointer to the filter coefficients
69//*
70//* //param[in] ht
71//*  integer height of the array
72//*
73//* //param[in] wd
74//*  integer width of the array
75//*
76//* //returns
77//*
78//* //remarks
79//*  none
80//*
81//*******************************************************************************
82//*/
83//void ihevc_inter_pred_chroma_vert(uword8 *pu1_src,
84//                                   uword8 *pu1_dst,
85//                                   word32 src_strd,
86//                                   word32 dst_strd,
87//                                   word8 *pi1_coeff,
88//                                   word32 ht,
89//                                   word32 wd)
//**************variables vs registers*****************************************
// target : AArch64, GNU as syntax, AAPCS64 calling convention
//
// arguments on entry
//x0 => *pu1_src
//x1 => *pu1_dst
//x2 =>  src_strd   (word32, sign extended to 64 bits at entry)
//x3 =>  dst_strd   (word32, sign extended to 64 bits at entry)
//x4 => *pi1_coeff
//x5 =>  ht         (word32, sign extended into x4 at entry)
//x6 =>  wd         (word32, sign extended in place at entry)
//
// working registers after the entry sequence
//x0      => source pointer, starts at pu1_src - src_strd (row of the first tap)
//x1      => destination pointer
//x4      => ht (rows still to be produced in the 2-row loops)
//x10     => 2*wd, the number of bytes handled per row
//           NOTE(review): presumably 2*wd because cb/cr samples are interleaved
//           - confirm against the caller
//x20     => scratch (callee saved, hence the stp/ldp pair)
//v0-v3   => |coeff[0]| .. |coeff[3]| replicated over 8 byte lanes
//
// filter computed for every output byte (rows relative to pu1_src - src_strd)
//   acc = |c1|*row1 + |c2|*row2 - |c0|*row0 - |c3|*row3
//   out = saturate_u8((acc + 32) >> 6)                       (sqrshrun #6)
// i.e. the code takes taps 0 and 3 as non positive and taps 1 and 2 as
// non negative, because only the absolute values are kept.
//
// clobbers : x0-x12, x14, v0-v7, v16-v18, v24, v26, v28, v30, flags
//*****************************************************************************
.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_vert_av8

.type ihevc_inter_pred_chroma_vert_av8, %function

ihevc_inter_pred_chroma_vert_av8:

    stp         x19, x20,[sp,#-16]!         //x20 is used as scratch; x19 only completes the 16 byte pair

    mov         x12,x4                      //x12 = pi1_coeff (x4 is reused for ht below)
    // AAPCS64 leaves bits [63:32] of a 32 bit argument unspecified, so every
    // word32 argument is sign extended before it enters 64 bit arithmetic.
    sxtw        x4,w5                       //x4 = ht
    sxtw        x6,w6                       //x6 = wd
    sxtw        x2,w2                       //x2 = src_strd
    sxtw        x3,w3                       //x3 = dst_strd

    cmp         x4,#0                       //checks ht <= 0
    sub         x0,x0,x2                    //pu1_src - src_strd
    ld1         {v0.s}[0],[x12]             //loads the 4 coefficient bytes (only lanes 0..3 are used)

    ble         end_loops                   //nothing to do when ht <= 0

    tst         x6,#3                       //checks (wd & 3)
    abs         v3.8b, v0.8b                //vabs_s8(coeff)
    lsl         x10,x6,#1                   //2*wd
    dup         v0.8b, v3.b[0]              //coeffabs_0
    dup         v1.8b, v3.b[1]              //coeffabs_1
    dup         v2.8b, v3.b[2]              //coeffabs_2
    dup         v3.8b, v3.b[3]              //coeffabs_3 (must be last, it overwrites the source of the dups)

    bne         outer_loop_wd_2             //wd not a multiple of 4 : 4 bytes per step

    tst         x4,#7                       //checks ht for mul of 8
    beq         core_loop_ht_8              //wd multiple of 4 and ht multiple of 8 : pipelined loop

    lsl         x7,x3,#1                    //2*dst_strd
    sub         x9,x7,x10                   //2*dst_strd - 2wd
    lsl         x12,x2,#1                   //2*src_strd
    sub         x8,x12,x10                  //2*src_strd - 2wd
    mov         x5,x10                      //x5 = bytes left in the current row pair

// wd multiple of 4, ht not a multiple of 8.
// each pass produces 8 bytes of two consecutive output rows from five
// source rows (v5, v17, v4, v16, v18 = rows 0..4).
inner_loop_ht_2:

    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v17.8b},[x6],x2            //row 1
    subs        x5,x5,#8                    //2wd - 8, flags consumed by the first bgt below
    ld1         {v5.8b},[x0],#8             //row 0, source advances by 8 bytes
    umull       v6.8h, v17.8b, v1.8b        //out0  = row1 * coeffabs_1
    ld1         {v4.8b},[x6],x2             //row 2
    umlsl       v6.8h, v5.8b, v0.8b         //out0 -= row0 * coeffabs_0
    ld1         {v16.8b},[x6],x2            //row 3
    umlal       v6.8h, v4.8b, v2.8b         //out0 += row2 * coeffabs_2
    umull       v4.8h, v4.8b, v1.8b         //out1  = row2 * coeffabs_1 (row 2 is no longer needed as bytes)
    umlsl       v6.8h, v16.8b, v3.8b        //out0 -= row3 * coeffabs_3
    umlsl       v4.8h, v17.8b, v0.8b        //out1 -= row1 * coeffabs_0
    ld1         {v18.8b},[x6]               //row 4
    umlal       v4.8h, v16.8b, v2.8b        //out1 += row3 * coeffabs_2
    sqrshrun    v6.8b, v6.8h,#6             //round, shift right by 6, saturate to u8
    umlsl       v4.8h, v18.8b, v3.8b        //out1 -= row4 * coeffabs_3
    add         x6,x1,x3                    //pu1_dst + dst_strd
    sqrshrun    v4.8b, v4.8h,#6             //round, shift right by 6, saturate to u8
    st1         {v6.8b},[x1],#8             //stores output row 0

    st1         {v4.8b},[x6]                //stores output row 1

    bgt         inner_loop_ht_2             //more bytes in this row pair

    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //pu1_dst += (2*dst_strd - 2wd)
    mov         x5,x10                      //reloads 2wd
    add         x0,x0,x8                    //pu1_src += (2*src_strd - 2wd)

    bgt         inner_loop_ht_2             //next row pair

    b           end_loops                   //jumps to end

// (wd & 3) != 0 : 4 bytes of two output rows per pass.
// lane 0 of each d register feeds output row 0, lane 1 feeds output row 1.
outer_loop_wd_2:
    lsl         x5,x3,#1                    //2*dst_strd
    mov         x12,x10                     //x12 = bytes left in the current row pair
    sub         x9,x5,x10                   //2*dst_strd - 2wd
    lsl         x7,x2,#1                    //2*src_strd
    sub         x8,x7,x10                   //2*src_strd - 2wd

inner_loop_wd_2:

    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v6.s}[0],[x0]              //v6 lane 0 = row 0
    subs        x12,x12,#4                  //2wd - 4, flags consumed by the first bgt below
    add         x0,x0,#4                    //pu1_src + 4
    ld1         {v6.s}[1],[x6],x2           //v6 lane 1 = row 1
    dup         v7.2s, v6.s[1]              //v7 = {row1, row1}
    ld1         {v7.s}[1],[x6],x2           //v7 = {row1, row2}
    umull       v4.8h, v7.8b, v1.8b         //acc  = {row1, row2} * coeffabs_1
    dup         v7.2s, v7.s[1]              //v7 = {row2, row2}
    ld1         {v7.s}[1],[x6],x2           //v7 = {row2, row3}
    umlsl       v4.8h, v6.8b, v0.8b         //acc -= {row0, row1} * coeffabs_0
    umlal       v4.8h, v7.8b, v2.8b         //acc += {row2, row3} * coeffabs_2
    dup         v7.2s, v7.s[1]              //v7 = {row3, row3}
    ld1         {v7.s}[1],[x6]              //v7 = {row3, row4}
    add         x6,x1,x3                    //pu1_dst + dst_strd
    umlsl       v4.8h, v7.8b, v3.8b         //acc -= {row3, row4} * coeffabs_3
    sqrshrun    v4.8b, v4.8h,#6             //round, shift right by 6, saturate to u8
    st1         {v4.s}[0],[x1]              //stores output row 0
    add         x1,x1,#4                    //pu1_dst += 4
    st1         {v4.s}[1],[x6]              //stores output row 1

    bgt         inner_loop_wd_2             //more bytes in this row pair

    //inner loop ends
    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //pu1_dst += 2*dst_strd - 2*wd
    mov         x12,x10                     //reloads 2wd
    add         x0,x0,x8                    //pu1_src += 2*src_strd - 2*wd

    bgt         inner_loop_wd_2             //next row pair

    b           end_loops                   //jumps to end

// wd multiple of 4 and ht multiple of 8.
// the work is split into tiles of 8 bytes x 4 rows and software pipelined:
//   prolog   : computes the first tile, stores its rows 0 and 1, loads the second tile
//   kernel_8 : stores rows 2 and 3 of the previous tile, computes the current
//              tile, loads the next tile
//   epilog   : stores rows 2 and 3 of the previous tile, computes and stores the last tile
// x12 counts 4 per tile; with wd >= 4 and ht >= 8 there are at least two tiles.
// the instruction order below is deliberate (loads/stores interleaved with the
// multiplies) and the csel instructions depend on the flags of the preceding
// subs, so do not reorder.
core_loop_ht_8:

    lsl         x12,x3,#2                   //4*dst_strd
    sub         x8,x12,x10                  //x8 = 4*dst_strd - 2wd (destination wrap)
    lsl         x12,x2,#2                   //4*src_strd
    sub         x9,x12,x10                  //x9 = 4*src_strd - 2wd (source wrap)

    bic         x5,x10,#7                   //x5 = 2wd rounded down to 8, bytes left in the 4 row band
    lsr         x14, x10, #3                //2wd / 8 = tiles per band
    mul         x12, x4 , x14               //ht * tiles per band = 4 * number of tiles
    sub         x12, x12,#4                 //keeps one tile back for the epilog

prolog:
    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v5.8b},[x6],x2             //row 1
    subs        x5,x5,#8                    //2wd - 8, le => this band is finished
    ld1         {v4.8b},[x0],#8             //row 0, source advances by 8 bytes
    ld1         {v6.8b},[x6],x2             //row 2
    umull       v30.8h, v5.8b, v1.8b        //out0  = row1 * coeffabs_1
    ld1         {v7.8b},[x6],x2             //row 3
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row0 * coeffabs_0
    add         x7,x1,x3                    //pu1_dst + dst_strd
    umlal       v30.8h, v6.8b, v2.8b        //out0 += row2 * coeffabs_2
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row3 * coeffabs_3
    ld1         {v16.8b},[x6],x2            //row 4

    umull       v28.8h, v6.8b, v1.8b        //out1  = row2 * coeffabs_1
    add         x20,x0,x9                   //pu1_src + (4*src_strd - 2*wd)
    csel        x0, x20, x0,le              //moves the source to the next band when this one is done
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row1 * coeffabs_0
    bic         x20,x10,#7                  //2wd rounded down to 8
    csel        x5, x20, x5,le              //reloads the band byte counter when this one is done
    umlal       v28.8h, v7.8b, v2.8b        //out1 += row3 * coeffabs_2
    ld1         {v17.8b},[x6],x2            //row 5
    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row4 * coeffabs_3
    sqrshrun    v30.8b, v30.8h,#6           //out0 : round, shift right by 6, saturate to u8

    ld1         {v18.8b},[x6],x2            //row 6
    umull       v26.8h, v7.8b, v1.8b        //out2  = row3 * coeffabs_1
    add         x6,x0,x2                    //next tile : pu1_src + src_strd
    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row2 * coeffabs_0
    st1         {v30.8b},[x1],#8            //stores output row 0
    umlal       v26.8h, v16.8b, v2.8b       //out2 += row4 * coeffabs_2
    ld1         {v4.8b},[x0],#8             //next tile : row 0
    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row5 * coeffabs_3
    sqrshrun    v28.8b, v28.8h,#6           //out1 : round, shift right by 6, saturate to u8

    add         x20,x1,x8                   //pu1_dst + (4*dst_strd - 2*wd)
    csel        x1, x20, x1,le              //moves the destination to the next band when this one is done
    umull       v24.8h, v16.8b, v1.8b       //out3  = row4 * coeffabs_1
    ld1         {v5.8b},[x6],x2             //next tile : row 1
    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row3 * coeffabs_0
    subs        x12,x12,#4                  //one tile done, flags consumed by ble epilog
    ld1         {v6.8b},[x6],x2             //next tile : row 2
    umlal       v24.8h, v17.8b, v2.8b       //out3 += row5 * coeffabs_2
    ld1         {v7.8b},[x6],x2             //next tile : row 3
    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row6 * coeffabs_3

    st1         {v28.8b},[x7],x3            //stores output row 1
    sqrshrun    v26.8b, v26.8h,#6           //out2 : narrowed now, stored by kernel_8 / epilog
    sub         x20,x2,x2,lsl #3            //src_strd - 8*src_strd
    neg         x11, x20                    //x11 = 7*src_strd, prefetch offset
    add         x14,x2,x2,lsl #1            //3*src_strd
    add         x14,x14,x11                 //x14 = 10*src_strd, prefetch offset limit
    ble         epilog                      //only two tiles in total

kernel_8:

    umull       v30.8h, v5.8b, v1.8b        //out0  = row1 * coeffabs_1
    subs        x5,x5,#8                    //2wd - 8, le => this band is finished
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row0 * coeffabs_0
    add         x20,x0,x9                   //pu1_src + (4*src_strd - 2*wd)
    csel        x0, x20, x0,le              //moves the source to the next band when this one is done
    umlal       v30.8h, v6.8b, v2.8b        //out0 += row2 * coeffabs_2
    lsl         x20,x2,#3                   //8*src_strd
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,le              //restarts the prefetch offset on a new band
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row3 * coeffabs_3
    st1         {v26.8b},[x7],x3            //stores output row 2 of the previous tile
    sqrshrun    v24.8b, v24.8h,#6           //previous out3 : round, shift right by 6, saturate to u8

    ld1         {v16.8b},[x6],x2            //row 4

    umull       v28.8h, v6.8b, v1.8b        //out1  = row2 * coeffabs_1
    bic         x20,x10,#7                  //2wd rounded down to 8
    csel        x5, x20, x5,le              //reloads the band byte counter when this one is done
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row1 * coeffabs_0
    st1         {v24.8b},[x7],x3            //stores output row 3 of the previous tile

    umlal       v28.8h, v7.8b, v2.8b        //out1 += row3 * coeffabs_2

    ld1         {v17.8b},[x6],x2            //row 5
    sqrshrun    v30.8b, v30.8h,#6           //out0 : round, shift right by 6, saturate to u8

    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row4 * coeffabs_3
    ld1         {v18.8b},[x6],x2            //row 6
    add         x7,x1,x3                    //pu1_dst + dst_strd
    umull       v26.8h, v7.8b, v1.8b        //out2  = row3 * coeffabs_1
    add         x6,x0,x2                    //next tile : pu1_src + src_strd

    add         x20,x0, x11
    prfm        PLDL1KEEP,[x20]             //prefetch hint at pu1_src + x11


    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row2 * coeffabs_0
    ld1         {v4.8b},[x0],#8             //next tile : row 0

    umlal       v26.8h, v16.8b, v2.8b       //out2 += row4 * coeffabs_2
    st1         {v30.8b},[x1],#8            //stores output row 0

    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row5 * coeffabs_3
    ld1         {v5.8b},[x6],x2             //next tile : row 1

    add         x11,x11,x2                  //prefetch offset advances by one row
    sqrshrun    v28.8b, v28.8h,#6           //out1 : round, shift right by 6, saturate to u8

    umull       v24.8h, v16.8b, v1.8b       //out3  = row4 * coeffabs_1
    ld1         {v6.8b},[x6],x2             //next tile : row 2
    add         x20,x1,x8                   //pu1_dst + (4*dst_strd - 2*wd)
    csel        x1, x20, x1,le              //moves the destination to the next band (flags still from subs x5)

    cmp         x11,x14                     //prefetch offset beyond 10*src_strd ?
    lsl         x20,x2,#3                   //8*src_strd
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,gt              //wraps the prefetch offset back to 7*src_strd

    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row3 * coeffabs_0
    subs        x12,x12,#4                  //one tile done, flags consumed by bgt kernel_8

    umlal       v24.8h, v17.8b, v2.8b       //out3 += row5 * coeffabs_2
    ld1         {v7.8b},[x6],x2             //next tile : row 3

    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row6 * coeffabs_3
    st1         {v28.8b},[x7],x3            //stores output row 1
    sqrshrun    v26.8b, v26.8h,#6           //out2 : narrowed now, stored by the next pass / epilog

    bgt         kernel_8                    //more tiles before the last one

epilog:

    umull       v30.8h, v5.8b, v1.8b        //out0  = row1 * coeffabs_1
    umlsl       v30.8h, v4.8b, v0.8b        //out0 -= row0 * coeffabs_0
    umlal       v30.8h, v6.8b, v2.8b        //out0 += row2 * coeffabs_2
    umlsl       v30.8h, v7.8b, v3.8b        //out0 -= row3 * coeffabs_3
    st1         {v26.8b},[x7],x3            //stores output row 2 of the previous tile
    sqrshrun    v24.8b, v24.8h,#6           //previous out3 : round, shift right by 6, saturate to u8

    ld1         {v16.8b},[x6],x2            //row 4
    umull       v28.8h, v6.8b, v1.8b        //out1  = row2 * coeffabs_1
    umlsl       v28.8h, v5.8b, v0.8b        //out1 -= row1 * coeffabs_0
    umlal       v28.8h, v7.8b, v2.8b        //out1 += row3 * coeffabs_2
    umlsl       v28.8h, v16.8b, v3.8b       //out1 -= row4 * coeffabs_3
    st1         {v24.8b},[x7],x3            //stores output row 3 of the previous tile
    sqrshrun    v30.8b, v30.8h,#6           //out0 : round, shift right by 6, saturate to u8

    ld1         {v17.8b},[x6],x2            //row 5
    umull       v26.8h, v7.8b, v1.8b        //out2  = row3 * coeffabs_1
    add         x7,x1,x3                    //pu1_dst + dst_strd
    umlsl       v26.8h, v6.8b, v0.8b        //out2 -= row2 * coeffabs_0
    st1         {v30.8b},[x1],#8            //stores output row 0

    sqrshrun    v28.8b, v28.8h,#6           //out1 : round, shift right by 6, saturate to u8
    umlal       v26.8h, v16.8b, v2.8b       //out2 += row4 * coeffabs_2
    ld1         {v18.8b},[x6],x2            //row 6
    umlsl       v26.8h, v17.8b, v3.8b       //out2 -= row5 * coeffabs_3

    umull       v24.8h, v16.8b, v1.8b       //out3  = row4 * coeffabs_1
    sqrshrun    v26.8b, v26.8h,#6           //out2 : round, shift right by 6, saturate to u8
    st1         {v28.8b},[x7],x3            //stores output row 1
    umlsl       v24.8h, v7.8b, v0.8b        //out3 -= row3 * coeffabs_0
    umlal       v24.8h, v17.8b, v2.8b       //out3 += row5 * coeffabs_2
    st1         {v26.8b},[x7],x3            //stores output row 2
    umlsl       v24.8h, v18.8b, v3.8b       //out3 -= row6 * coeffabs_3

    sqrshrun    v24.8b, v24.8h,#6           //out3 : round, shift right by 6, saturate to u8
    st1         {v24.8b},[x7],x3            //stores output row 3
end_loops:
    ldp         x19, x20,[sp],#16           //restores the callee saved pair

    ret
403
404
405
406