1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19///**
20//*******************************************************************************
21//*
22//* //brief
23//*     interprediction luma function for copy
24//*
25//* //par description:
26//*   copies the array of width 'wd' and height 'ht' from the  location pointed
27//*   by 'src' to the location pointed by 'dst'
28//*
29//* //param[in] pu1_src
30//*  uword8 pointer to the source
31//*
32//* //param[out] pu1_dst
33//*  uword8 pointer to the destination
34//*
35//* //param[in] src_strd
36//*  integer source stride
37//*
38//* //param[in] dst_strd
39//*  integer destination stride
40//*
41//* //param[in] pi1_coeff
42//*  word8 pointer to the filter coefficients
43//*
44//* //param[in] ht
45//*  integer height of the array
46//*
47//* //param[in] wd
48//*  integer width of the array
49//*
50//* //returns
51//*
52//* //remarks
53//*  none
54//*
55//*******************************************************************************
56//*/
57//void ihevc_inter_pred_luma_copy (
58//                            uword8 *pu1_src,
59//                            uword8 *pu1_dst,
60//                            word32 src_strd,
61//                            word32 dst_strd,
62//                            word8 *pi1_coeff,
63//                            word32 ht,
64//                            word32 wd   )
65
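//**************variables vs registers*****************************************
//    x0  => *pu1_src
//    x1  => *pu1_dst
//    x2  =>  src_strd
//    x3  =>  dst_strd
//    x4  => *pi1_coeff (passed in, but not read by the copy routine)
//    x5  =>  ht (incoming argument; working copy is kept in x11)
//    x6  =>  wd (incoming argument; working copy is kept in x16)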
73
74.text
75.align 4
76
77.include "ihevc_neon_macros.s"
78
79.globl ihevc_inter_pred_luma_copy_av8
80
81.type ihevc_inter_pred_luma_copy_av8, %function
82
//-----------------------------------------------------------------------------
// void ihevc_inter_pred_luma_copy_av8(pu1_src, pu1_dst, src_strd, dst_strd,
//                                     pi1_coeff, ht, wd)
//
// Copies a block of wd x ht bytes from pu1_src to pu1_dst. Rows are handled
// four at a time; columns are handled in steps of 16, 8 or 4 bytes, chosen
// from the low bits of wd.
//
// In:    x0 = pu1_src      x1 = pu1_dst
//        w2 = src_strd     w3 = dst_strd      (signed 32-bit)
//        x4 = pi1_coeff    (not read)
//        w5 = ht           w6 = wd            (signed 32-bit)
// Out:   none (void); x0/x1 are left advanced past the copied rows.
// Uses:  x0-x3, x8-x11, x15, x16, v0-v3 and the flags. These are all
//        caller-saved under AAPCS64, so the routine does not touch the stack.
// Notes: - Returns immediately when ht <= 0 or wd <= 0.
//        - Every outer pass writes four rows, so ht is effectively rounded
//          up to a multiple of 4.
//        - wd is expected to be a multiple of 4: the 4-wide path rewinds the
//          row pointers by (wd - 4), which only lands on the start of the
//          next row band when wd is a multiple of the column step.
//-----------------------------------------------------------------------------
ihevc_inter_pred_luma_copy_av8:
    // The word32 arguments arrive in w registers; AAPCS64 leaves bits 63:32
    // unspecified, so widen them before they are used in 64-bit address math.
    sxtw        x2,w2                       //src_strd as a signed 64-bit offset
    sxtw        x3,w3                       //dst_strd as a signed 64-bit offset
    sxtw        x11,w5                      //x11 = ht (rows still to copy)
    sxtw        x16,w6                      //x16 = wd
    cmp         x11,#0                      //nothing to do when ht <= 0
    ble         end_loops
    cmp         x16,#0                      //nothing to do when wd <= 0
    ble         end_loops
    tst         x16,#15                     //wd multiple of 16 -> 16-byte columns
    beq         core_loop_wd_16
    tst         x16,#7                      //wd multiple of 8 -> 8-byte columns
    beq         core_loop_wd_8
    sub         x15,x16,#4                  //x15 = row rewind for the 4-wide path

outer_loop_wd_4:
    mov         x8,x16                      //x8 = bytes left in this row band (> 0 here)

inner_loop_wd_4:
    ld1         {v0.s}[0],[x0]              //row 0: load 4 bytes
    add         x9,x0,x2                    //x9  = src pointer for row 1
    add         x10,x1,x3                   //x10 = dst pointer for row 1
    st1         {v0.s}[0],[x1]              //row 0: store 4 bytes
    ld1         {v0.s}[0],[x9],x2           //row 1: load, step src to row 2
    add         x0,x0,#4                    //pu1_src += 4 (next column)
    st1         {v0.s}[0],[x10],x3          //row 1: store, step dst to row 2
    ld1         {v0.s}[0],[x9],x2           //row 2: load
    subs        x8,x8,#4                    //wd -= 4; flags consumed by bgt below
    st1         {v0.s}[0],[x10],x3          //row 2: store
    ld1         {v0.s}[0],[x9],x2           //row 3: load
    add         x1,x1,#4                    //pu1_dst += 4 (next column)
    st1         {v0.s}[0],[x10],x3          //row 3: store

    bgt         inner_loop_wd_4

    // x9/x10 now point 4 rows below the last column copied; step back by
    // (wd - 4) to reach the first column of the next band of 4 rows.
    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of next row band
    sub         x1,x10,x15                  //pu1_dst = start of next row band
    bgt         outer_loop_wd_4

end_loops:
    ret


core_loop_wd_8:
    sub         x15,x16,#8                  //x15 = row rewind for the 8-wide path

outer_loop_wd_8:
    mov         x8,x16                      //x8 = bytes left in this row band (> 0 here)

inner_loop_wd_8:
    add         x9,x0,x2                    //x9  = src pointer for row 1
    ld1         {v0.8b},[x0],#8             //row 0: load 8 bytes, pu1_src += 8
    add         x10,x1,x3                   //x10 = dst pointer for row 1
    st1         {v0.8b},[x1],#8             //row 0: store 8 bytes, pu1_dst += 8
    ld1         {v1.8b},[x9],x2             //row 1: load
    st1         {v1.8b},[x10],x3            //row 1: store
    subs        x8,x8,#8                    //wd -= 8; flags consumed by bgt below
    ld1         {v2.8b},[x9],x2             //row 2: load
    st1         {v2.8b},[x10],x3            //row 2: store
    ld1         {v3.8b},[x9],x2             //row 3: load
    st1         {v3.8b},[x10],x3            //row 3: store
    bgt         inner_loop_wd_8

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of next row band
    sub         x1,x10,x15                  //pu1_dst = start of next row band
    bgt         outer_loop_wd_8

    ret

core_loop_wd_16:
    sub         x15,x16,#16                 //x15 = row rewind for the 16-wide path

outer_loop_wd_16:
    mov         x8,x16                      //x8 = bytes left in this row band (> 0 here)

inner_loop_wd_16:
    add         x9,x0,x2                    //x9  = src pointer for row 1
    ld1         {v0.16b},[x0],#16           //row 0: load 16 bytes, pu1_src += 16
    add         x10,x1,x3                   //x10 = dst pointer for row 1
    st1         {v0.16b},[x1],#16           //row 0: store 16 bytes, pu1_dst += 16
    ld1         {v1.16b},[x9],x2            //row 1: load
    st1         {v1.16b},[x10],x3           //row 1: store
    subs        x8,x8,#16                   //wd -= 16; flags consumed by bgt below
    ld1         {v2.16b},[x9],x2            //row 2: load
    st1         {v2.16b},[x10],x3           //row 2: store
    ld1         {v3.16b},[x9],x2            //row 3: load
    st1         {v3.16b},[x10],x3           //row 3: store
    bgt         inner_loop_wd_16

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of next row band
    sub         x1,x10,x15                  //pu1_dst = start of next row band
    bgt         outer_loop_wd_16

    ret
196
197
198
199
200