1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19///**
20//*******************************************************************************
21//*
//* //brief
//*     inter prediction luma function for copy with 16-bit output
//*
//* //par description:
//*   reads the array of width 'wd' and height 'ht' from the location pointed
//*   to by 'src', widens each 8-bit sample to 16 bits, shifts it left by 6,
//*   and stores the result to the location pointed to by 'dst'
28//*
29//* //param[in] pu1_src
30//*  uword8 pointer to the source
31//*
//* //param[out] pi2_dst
//*  word16 pointer to the destination
34//*
35//* //param[in] src_strd
36//*  integer source stride
37//*
38//* //param[in] dst_strd
39//*  integer destination stride
40//*
41//* //param[in] pi1_coeff
42//*  word8 pointer to the filter coefficients
43//*
44//* //param[in] ht
45//*  integer height of the array
46//*
47//* //param[in] wd
48//*  integer width of the array
49//*
50//* //returns
51//*
52//* //remarks
53//*  none
54//*
55//*******************************************************************************
56//*/
57
58//void ihevc_inter_pred_luma_copy_w16out (
59//                                uword8 *pu1_src,
60//                                word16 *pi2_dst,
61//                                word32 src_strd,
62//                                word32 dst_strd,
63//                                word8 *pi1_coeff,
64//                                word32 ht,
65//                                word32 wd   )
66
67//**************variables vs registers*****************************************
68//    x0 => *pu1_src
69//    x1 => *pi2_dst
70//    x2 =>  src_strd
71//    x3 =>  dst_strd
72//    x7 =>  ht
73//    x12 => wd
74
75.text
76.align 4
77
78.include "ihevc_neon_macros.s"
79
80.globl ihevc_inter_pred_luma_copy_w16out_av8
81
82.type ihevc_inter_pred_luma_copy_w16out_av8, %function
83
ihevc_inter_pred_luma_copy_w16out_av8:

    //-------------------------------------------------------------------------
    // void ihevc_inter_pred_luma_copy_w16out_av8(uword8 *pu1_src,
    //                                            word16 *pi2_dst,
    //                                            word32  src_strd,
    //                                            word32  dst_strd,
    //                                            word8  *pi1_coeff,
    //                                            word32  ht,
    //                                            word32  wd)
    //
    // ABI:  AAPCS64, GNU as syntax.
    // In:   x0 = pu1_src, x1 = pi2_dst, w2 = src_strd, w3 = dst_strd,
    //       x4 = pi1_coeff (not read by this routine), w5 = ht, w6 = wd
    // Out:  dst[row][col] = (word16)src[row][col] << 6
    //       dst_strd is applied in 16-bit elements (it is doubled to get a
    //       byte stride); src_strd is applied in bytes.
    // Clobbers: x0-x12, v0-v7, v16, v18, v20, v22, v24, v26, flags.
    //       Only caller-saved registers are used, so nothing is spilled.
    //
    // Rows are always processed four at a time. Columns are processed eight
    // at a time when wd is a multiple of 8, otherwise four at a time.
    // NOTE(review): the pointer rewind arithmetic only lands on the next row
    // group when ht and wd are multiples of 4 - confirm callers guarantee it.
    // NOTE(review): loads and stores are scheduled assuming src and dst do
    // not overlap - confirm against callers.
    //-------------------------------------------------------------------------

    // The four word32 arguments arrive in w registers and AAPCS64 leaves the
    // upper 32 bits unspecified, so sign-extend them before 64-bit use.
    sxtw        x2, w2                      //src_strd
    sxtw        x3, w3                      //dst_strd
    sxtw        x7, w5                      //x7  = ht
    sxtw        x12, w6                     //x12 = wd

    cmp         x7, #0                      //nothing to do when ht <= 0
    ble         end_loops
    cmp         x12, #0                     //nothing to do when wd <= 0
    ble         end_loops
    tst         x12, #7                     //is wd a multiple of 8?
    beq         core_loop_wd_8

    // ---------------- 4-column path ----------------
    sub         x11, x12, #4                //x11 = wd - 4 (rewind amount)
    lsl         x6, x3, #1                  //x6  = dst_strd in bytes

outer_loop_wd_4:
    mov         x4, x12                     //x4 = columns left in this row group

inner_loop_wd_4:
    add         x5, x0, x2                  //x5  -> src row 1 of this 4x4 tile
    add         x10, x1, x6                 //x10 -> dst row 1 of this 4x4 tile

    // Lane loads fetch exactly the four bytes that are used. The other
    // lanes of each register keep stale data, but only the low 64 bits of
    // the widened result (the four new samples) are ever stored.
    ld1         {v0.s}[0], [x0], #4         //src row 0, pu1_src += 4
    ld1         {v22.s}[0], [x5], x2        //src row 1
    ld1         {v24.s}[0], [x5], x2        //src row 2
    ld1         {v26.s}[0], [x5], x2        //src row 3, x5 -> row 4 of this tile

    ushll       v0.8h, v0.8b, #6            //widen u8 -> 16 bit and shift left by 6
    ushll       v22.8h, v22.8b, #6
    ushll       v24.8h, v24.8b, #6
    ushll       v26.8h, v26.8b, #6

    subs        x4, x4, #4                  //wd - 4 (flags consumed by bgt below)

    st1         {v0.d}[0], [x1], #8         //dst row 0, pi2_dst += 4 elements
    st1         {v22.d}[0], [x10], x6       //dst row 1
    st1         {v24.d}[0], [x10], x6       //dst row 2
    st1         {v26.d}[0], [x10], x6       //dst row 3, x10 -> row 4 of this tile
    bgt         inner_loop_wd_4

    subs        x7, x7, #4                  //ht - 4
    sub         x0, x5, x11                 //src -> start of the next four rows
    sub         x1, x10, x11, lsl #1        //dst -> start of the next four rows
    bgt         outer_loop_wd_4

end_loops:
    ret


    // ---------------- 8-column path (software pipelined) ----------------
    // Each 8x4 block is loaded one stage ahead of being widened and stored.
    // x10 always points at dst row 1 of the block whose rows 1-3 (v2, v4, v6)
    // are still waiting to be written.
core_loop_wd_8:
    lsl         x5, x3, #1                  //x5  = dst_strd in bytes
    lsl         x11, x3, #2
    sub         x11, x11, x12               //x11 = (dst_strd * 4) - wd, in elements
    lsl         x8, x2, #2
    sub         x8, x8, x12                 //x8  = (src_strd * 4) - wd, in bytes
    lsr         x4, x12, #3                 //wd / 8
    mul         x7, x7, x4                  //x7 = ht * (wd / 8) = 4 * number of 8x4 blocks
    mov         x4, x12                     //x4 = columns left in this row group
    sub         x7, x7, #4                  //one block is handled outside the loop

prolog:
    add         x6, x0, x2                  //pu1_src_tmp += src_strd
    add         x10, x1, x5                 //x10 -> dst row 1 of block 0
    ld1         {v1.8b}, [x0], #8           //src row 0
    ld1         {v3.8b}, [x6], x2           //src row 1
    ld1         {v5.8b}, [x6], x2           //src row 2
    ld1         {v7.8b}, [x6], x2           //src row 3
    uxtl        v16.8h, v1.8b               //widen u8 -> 16 bit
    uxtl        v18.8h, v3.8b
    uxtl        v20.8h, v5.8b
    uxtl        v22.8h, v7.8b
    shl         v0.8h, v16.8h, #6           //shift left by 6
    shl         v2.8h, v18.8h, #6
    shl         v4.8h, v20.8h, #6
    shl         v6.8h, v22.8h, #6

    // With fewer than two blocks in total there is nothing to prefetch:
    // store the block just computed and return without reading any further
    // source rows.
    cmp         x7, #4
    blt         store_last_block

    subs        x4, x4, #8                  //wd decrements by 8
    add         x9, x0, x8
    csel        x0, x9, x0, le              //row group finished: src down four rows
    add         x6, x0, x2                  //pu1_src_tmp += src_strd
    ld1         {v1.8b}, [x0], #8           //prefetch block 1, row 0
    ld1         {v3.8b}, [x6], x2           //prefetch block 1, row 1
    ld1         {v5.8b}, [x6], x2           //prefetch block 1, row 2
    ld1         {v7.8b}, [x6], x2           //prefetch block 1, row 3

    st1         {v0.8h}, [x1], #16          //dst row 0 of block 0
    add         x9, x1, x11, lsl #1
    csel        x1, x9, x1, le              //row group finished: dst down four rows
    csel        x4, x12, x4, le             //row group finished: reload column count

    subs        x7, x7, #4                  //one more block accounted for
    beq         epilog                      //exactly two blocks: no steady state

outer_loop_wd_8:
    st1         {v2.8h}, [x10], x5          //previous block, dst row 1
    uxtl        v16.8h, v1.8b               //widen current block, row 0

    st1         {v4.8h}, [x10], x5          //previous block, dst row 2
    uxtl        v18.8h, v3.8b               //widen current block, row 1

    st1         {v6.8h}, [x10], x5          //previous block, dst row 3
    uxtl        v20.8h, v5.8b               //widen current block, row 2

    uxtl        v22.8h, v7.8b               //widen current block, row 3

    subs        x4, x4, #8                  //wd decrements by 8
    add         x9, x0, x8
    csel        x0, x9, x0, le              //row group finished: src down four rows

    add         x6, x0, x2                  //pu1_src_tmp += src_strd

    ld1         {v1.8b}, [x0], #8           //prefetch next block, row 0
    shl         v0.8h, v16.8h, #6           //shift left by 6

    ld1         {v3.8b}, [x6], x2           //prefetch next block, row 1
    shl         v2.8h, v18.8h, #6

    ld1         {v5.8b}, [x6], x2           //prefetch next block, row 2
    shl         v4.8h, v20.8h, #6

    ld1         {v7.8b}, [x6], x2           //prefetch next block, row 3
    add         x10, x1, x5                 //x10 -> dst row 1 of current block

    shl         v6.8h, v22.8h, #6

    st1         {v0.8h}, [x1], #16          //current block, dst row 0

    add         x9, x1, x11, lsl #1
    csel        x1, x9, x1, le              //row group finished: dst down four rows
    csel        x4, x12, x4, le             //row group finished: reload column count

    subs        x7, x7, #4                  //one more block accounted for
    bgt         outer_loop_wd_8

epilog:
    st1         {v2.8h}, [x10], x5          //previous block, dst row 1
    uxtl        v16.8h, v1.8b               //widen last block, row 0

    st1         {v4.8h}, [x10], x5          //previous block, dst row 2
    uxtl        v18.8h, v3.8b               //widen last block, row 1

    st1         {v6.8h}, [x10], x5          //previous block, dst row 3
    uxtl        v20.8h, v5.8b               //widen last block, row 2

    uxtl        v22.8h, v7.8b               //widen last block, row 3

    shl         v0.8h, v16.8h, #6           //shift left by 6
    shl         v2.8h, v18.8h, #6
    shl         v4.8h, v20.8h, #6
    add         x10, x1, x5                 //x10 -> dst row 1 of last block
    shl         v6.8h, v22.8h, #6

store_last_block:
    st1         {v0.8h}, [x1], #16          //last block, dst row 0
    st1         {v2.8h}, [x10], x5          //last block, dst row 1
    st1         {v4.8h}, [x10], x5          //last block, dst row 2
    st1         {v6.8h}, [x10], x5          //last block, dst row 3

    ret
269
270
271
272
273