1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* //file
21//*  ihevc_inter_pred_chroma_copy_w16out_neon.s
22//*
23//* //brief
//*  contains function definitions for inter prediction  interpolation.
//* functions are coded using neon instructions (armv8 / aarch64) and can be
//* assembled using the gnu assembler
28//*
29//* //author
30//*  yogeswaran rs
31//*
//* //par list of functions:
//*  - ihevc_inter_pred_chroma_copy_w16out_av8()
//*
35//* //remarks
36//*  none
37//*
38//*******************************************************************************
39//*/
40///**
41//*******************************************************************************
42//*
//* //brief
//*   chroma interprediction filter for copy with 16-bit output
//*
//* //par description:
//*    reads the array of width 'wd' and height 'ht' from the location pointed
//*    by 'src', widens every 8-bit sample to 16 bits, shifts it left by 6 and
//*    stores the result at the location pointed by 'dst'
49//*
50//* //param[in] pu1_src
51//*  uword8 pointer to the source
52//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
55//*
56//* //param[in] src_strd
57//*  integer source stride
58//*
59//* //param[in] dst_strd
60//*  integer destination stride
61//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients (not read by this function)
64//*
65//* //param[in] ht
66//*  integer height of the array
67//*
68//* //param[in] wd
69//*  integer width of the array
70//*
71//* //returns
72//*
73//* //remarks
74//*  none
75//*
76//*******************************************************************************
77//*/
78
79//void ihevc_inter_pred_chroma_copy_w16out(uword8 *pu1_src,
80//                                            word16 *pi2_dst,
81//                                            word32 src_strd,
82//                                            word32 dst_strd,
83//                                            word8 *pi1_coeff,
84//                                            word32 ht,
85//                                            word32 wd)
86//**************variables vs registers*****************************************
87//x0 => *pu1_src
88//x1 => *pi2_dst
89//x2 =>  src_strd
90//x3 =>  dst_strd
91//x4 => *pi1_coeff
92//x5 =>  ht
93//x6 =>  wd
94
95.text
96.align 4
97
98.include "ihevc_neon_macros.s"
99
.globl ihevc_inter_pred_chroma_copy_w16out_av8

.type ihevc_inter_pred_chroma_copy_w16out_av8, %function

//-----------------------------------------------------------------------------
// void ihevc_inter_pred_chroma_copy_w16out_av8(uword8 *pu1_src,
//                                              word16 *pi2_dst,
//                                              word32 src_strd,
//                                              word32 dst_strd,
//                                              word8 *pi1_coeff,
//                                              word32 ht,
//                                              word32 wd)
//
// Purpose:
//   For every row, reads 2*wd bytes from pu1_src, zero-extends each byte to
//   16 bits, shifts it left by 6 and stores it to pi2_dst.
//
// In:    x0 = pu1_src, x1 = pi2_dst, w2 = src_strd (bytes),
//        w3 = dst_strd (16-bit elements), x4 = pi1_coeff (never read),
//        w5 = ht, w6 = wd
// Out:   none; returns immediately when ht <= 0 or wd <= 0
// Clobb: x0-x12, v0-v7, v16, v18, v20, v22, v24, v26, flags
//        (x19/x20 are saved and restored; x20 is used as scratch)
// Stack: 16 bytes
//
// Paths:
//   * 8 bytes x 4 rows, software pipelined (prolog / outer_loop_wd_8 /
//     epilog): taken when 2*wd is a multiple of 8 and (ht & 6) != 6.
//     With ht < 4 the two-row loop core_loop_wd_8_ht_2 is used instead.
//   * 4 bytes x 4 rows (inner_loop_wd_4) followed by a two-row tail
//     (inner_loop_wd_4_ht_2) when ht % 4 != 0: taken otherwise.
//
// Notes:
//   * The 4-byte path loads 8 source bytes per column but stores only the
//     four samples that come from the first 4 bytes.
//   * NOTE(review): a non-zero ht % 4 is always handled as exactly two rows,
//     and the 8-byte path ignores the remainder when ht >= 4 (for example
//     ht = 10 converts only 8 rows). Presumably callers only pass heights
//     for which this is sufficient - confirm against the callers.
//   * NOTE(review): the 4-byte path advances in steps of 4 bytes, so 2*wd is
//     presumably a multiple of 4 - confirm against the callers.
//-----------------------------------------------------------------------------
ihevc_inter_pred_chroma_copy_w16out_av8:

    stp         x19, x20, [sp, #-16]!       //x20 is used as scratch below

    //the integer arguments are 32 bit wide; the upper halves of their
    //registers are not guaranteed to be extended by the caller
    sxtw        x2, w2                      //src_strd
    sxtw        x3, w3                      //dst_strd
    sxtw        x7, w5                      //ht
    sxtw        x12, w6                     //wd
    lsl         x12, x12, #1                //x12 = 2*wd = source bytes per row

    cmp         x7, #0                      //nothing to do for ht <= 0
    ble         end_loops
    cmp         x12, #0                     //nothing to do for wd <= 0
    ble         end_loops

    and         x8, x7, #3                  //x8 = ht % 4
    sub         x9, x7, x8                  //x9 = ht rounded down to a multiple of 4
    and         x11, x7, #6
    cmp         x11, #6                     //(ht & 6) == 6 always uses the 4 byte path
    beq         loop_ht_6
    tst         x12, #7                     //2*wd multiple of 8 -> 8 byte path
    beq         core_loop_wd_8

loop_ht_6:
    sub         x11, x12, #4                //x11 = 2*wd - 4, used to rewind the pointers
    lsl         x6, x3, #1                  //x6 = destination row pitch in bytes
    cbz         x9, outer_loop_wd_4_ht_2    //ht < 4: only the two row tail remains

outer_loop_wd_4:
    mov         x4, x12                     //x4 = source bytes left in this band of rows

inner_loop_wd_4:
    ld1         {v0.8b}, [x0]               //row 0 (8 bytes loaded, 4 used)
    add         x5, x0, x2                  //x5 -> row 1 of the source
    uxtl        v0.8h, v0.8b                //widen to 16 bit
    add         x10, x1, x6                 //x10 -> row 1 of the destination
    subs        x4, x4, #4                  //flags are consumed by the bgt below
    shl         v0.2d, v0.2d, #6            //samples << 6 (values <= 255, no carry between samples)
    ld1         {v22.8b}, [x5], x2          //row 1
    add         x0, x0, #4                  //pu1_src += 4
    st1         {v0.1d}, [x1]               //store 4 samples of row 0
    add         x1, x1, #8                  //pi2_dst += 4 samples
    uxtl        v22.8h, v22.8b
    ld1         {v24.8b}, [x5], x2          //row 2
    shl         v22.2d, v22.2d, #6
    uxtl        v24.8h, v24.8b
    st1         {v22.1d}, [x10], x6         //store 4 samples of row 1
    shl         v24.2d, v24.2d, #6
    ld1         {v26.8b}, [x5], x2          //row 3
    st1         {v24.1d}, [x10], x6         //store 4 samples of row 2
    uxtl        v26.8h, v26.8b
    shl         v26.2d, v26.2d, #6
    st1         {v26.1d}, [x10], x6         //store 4 samples of row 3
    bgt         inner_loop_wd_4

end_inner_loop_wd_4:
    subs        x9, x9, #4                  //four more rows done
    sub         x0, x5, x11                 //x0 -> start of the next band of source rows
    sub         x1, x10, x11, lsl #1        //x1 -> start of the next band of destination rows
    bgt         outer_loop_wd_4
    cmp         x8, #0                      //remaining rows (ht % 4)?
    bgt         outer_loop_wd_4_ht_2


end_loops:
    ldp         x19, x20, [sp], #16

    ret


outer_loop_wd_4_ht_2:
    mov         x4, x12                     //x4 = source bytes left in the two tail rows

inner_loop_wd_4_ht_2:
    ld1         {v0.8b}, [x0]               //row 0 (8 bytes loaded, 4 used)
    add         x5, x0, x2                  //x5 -> row 1 of the source
    uxtl        v0.8h, v0.8b
    add         x10, x1, x6                 //x10 -> row 1 of the destination
    subs        x4, x4, #4                  //flags are consumed by the bgt below
    shl         v0.2d, v0.2d, #6
    ld1         {v22.8b}, [x5]              //row 1; no further row is read
    add         x0, x0, #4                  //pu1_src += 4
    st1         {v0.1d}, [x1]               //store 4 samples of row 0
    add         x1, x1, #8                  //pi2_dst += 4 samples
    uxtl        v22.8h, v22.8b
    shl         v22.2d, v22.2d, #6
    st1         {v22.1d}, [x10]             //store 4 samples of row 1
    bgt         inner_loop_wd_4_ht_2
    b           end_loops


core_loop_wd_8:
    lsl         x5, x3, #1                  //x5 = destination row pitch in bytes
    lsl         x11, x3, #2
    sub         x11, x11, x12               //x11 = 4*dst_strd - 2*wd (16-bit elements)
    lsl         x8, x2, #2
    sub         x8, x8, x12                 //x8 = 4*src_strd - 2*wd (bytes)
    lsr         x4, x12, #3                 //8 byte columns per row
    mul         x7, x9, x4                  //x7 = 4 * number of 8x4 blocks
    mov         x4, x12                     //x4 = source bytes left in the current band
    sub         x7, x7, #4                  //one block is consumed by the prolog
    cbz         x9, core_loop_wd_8_ht_2     //ht < 4: two row loop

prolog:
    add         x6, x0, x2                  //x6 -> row 1 of the source
    add         x10, x1, x5                 //x10 -> row 1 of the destination
    ld1         {v1.8b}, [x0], #8           //first block, rows 0..3
    ld1         {v3.8b}, [x6], x2
    ld1         {v5.8b}, [x6], x2
    ld1         {v7.8b}, [x6], x2
    uxtl        v16.8h, v1.8b
    uxtl        v18.8h, v3.8b
    uxtl        v20.8h, v5.8b
    uxtl        v22.8h, v7.8b
    subs        x4, x4, #8                  //le: this band of rows is finished
    shl         v0.8h, v16.8h, #6
    shl         v2.8h, v18.8h, #6
    shl         v4.8h, v20.8h, #6
    shl         v6.8h, v22.8h, #6
    cbz         x7, epilog_store            //single block: store it, do not read a next one
    add         x20, x0, x8
    csel        x0, x20, x0, le             //step the source to the next band when finished
    add         x6, x0, x2                  //x6 -> row 1 of the next block
    ld1         {v1.8b}, [x0], #8           //load the next block
    ld1         {v3.8b}, [x6], x2
    ld1         {v5.8b}, [x6], x2
    ld1         {v7.8b}, [x6], x2

    st1         {v0.8h}, [x1], #16          //row 0 of the current block
    add         x20, x1, x11, lsl #1
    csel        x1, x20, x1, le             //step the destination to the next band when finished
    csel        x4, x12, x4, le             //reload the per band byte counter

    subs        x7, x7, #4                  //account for the block just loaded

    beq         epilog                      //the block just loaded is the last one



outer_loop_wd_8:

    st1         {v2.8h}, [x10], x5          //rows 1..3 of the current block
    uxtl        v16.8h, v1.8b               //meanwhile widen the block loaded last

    st1         {v4.8h}, [x10], x5
    uxtl        v18.8h, v3.8b

    st1         {v6.8h}, [x10], x5
    uxtl        v20.8h, v5.8b

    uxtl        v22.8h, v7.8b

    subs        x4, x4, #8                  //le: this band of rows is finished
    add         x20, x0, x8
    csel        x0, x20, x0, le             //step the source to the next band when finished

    add         x6, x0, x2                  //x6 -> row 1 of the next block

    ld1         {v1.8b}, [x0], #8           //load the next block
    shl         v0.8h, v16.8h, #6

    ld1         {v3.8b}, [x6], x2
    shl         v2.8h, v18.8h, #6

    ld1         {v5.8b}, [x6], x2
    shl         v4.8h, v20.8h, #6

    ld1         {v7.8b}, [x6], x2
    add         x10, x1, x5                 //x10 -> row 1 of the block about to be stored

    shl         v6.8h, v22.8h, #6

    st1         {v0.8h}, [x1], #16          //row 0 of that block

    add         x20, x1, x11, lsl #1
    csel        x1, x20, x1, le             //step the destination to the next band when finished
    csel        x4, x12, x4, le             //reload the per band byte counter

    subs        x7, x7, #4                  //account for the block just loaded
    bgt         outer_loop_wd_8

epilog:
    st1         {v2.8h}, [x10], x5          //rows 1..3 of the previous block
    uxtl        v16.8h, v1.8b               //convert the last loaded block

    st1         {v4.8h}, [x10], x5
    uxtl        v18.8h, v3.8b

    st1         {v6.8h}, [x10], x5
    uxtl        v20.8h, v5.8b

    uxtl        v22.8h, v7.8b

    shl         v0.8h, v16.8h, #6
    shl         v2.8h, v18.8h, #6
    shl         v4.8h, v20.8h, #6
    add         x10, x1, x5                 //x10 -> row 1 of the last block
    shl         v6.8h, v22.8h, #6

epilog_store:
    st1         {v0.8h}, [x1], #16          //row 0 of the last block
epilog_end:
    st1         {v2.8h}, [x10], x5          //rows 1..3 of the last block
    st1         {v4.8h}, [x10], x5
    st1         {v6.8h}, [x10], x5
    b           end_loops

core_loop_wd_8_ht_2:
    add         x6, x0, x2                  //x6 -> row 1 of the source
    add         x10, x1, x5                 //x10 -> row 1 of the destination
    ld1         {v1.8b}, [x0], #8           //row 0
    ld1         {v3.8b}, [x6], x2           //row 1
    uxtl        v16.8h, v1.8b
    uxtl        v18.8h, v3.8b
    subs        x12, x12, #8                //8 source bytes per row done
    shl         v0.8h, v16.8h, #6
    shl         v2.8h, v18.8h, #6
    st1         {v0.8h}, [x1], #16          //8 samples of row 0
    st1         {v2.8h}, [x10], x5          //8 samples of row 1
    bgt         core_loop_wd_8_ht_2
    b           end_loops
343
344
345
346
347
348
349