///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///**
//*******************************************************************************
//*
//* //brief
//*     interprediction luma function for copy
//*
//* //par description:
//*     copies the array of width 'wd' and height 'ht' from the location pointed
//*     by 'src' to the location pointed by 'dst'
//*
//* //param[in] pu1_src
//*     uword8 pointer to the source
//*
//* //param[out] pu1_dst
//*     uword8 pointer to the destination
//*
//* //param[in] src_strd
//*     integer source stride
//*
//* //param[in] dst_strd
//*     integer destination stride
//*
//* //param[in] pi1_coeff
//*     word8 pointer to the filter coefficients (x4 is never read by this
//*     routine)
//*
//* //param[in] ht
//*     integer height of the array
//*
//* //param[in] wd
//*     integer width of the array
//*
//* //returns
//*     nothing (void); x0 is not a meaningful result on exit
//*
//* //remarks
//*     - Nothing is copied when ht <= 0 or wd <= 0.
//*     - Rows are copied in bands of 4, so a height that is not a multiple of 4
//*       is effectively rounded up to the next multiple of 4.
//*     - Columns are copied in chunks of 16 bytes when wd is a multiple of 16,
//*       8 bytes when wd is a multiple of 8, and 4 bytes otherwise; in the last
//*       case a width that is not a multiple of 4 is rounded up to the next
//*       multiple of 4.
//*     - Leaf routine: uses no stack and only the caller-saved registers
//*       x0-x3, x8-x11, x15, x16 and v0-v3.
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_luma_copy (
//                            uword8 *pu1_src,
//                            uword8 *pu1_dst,
//                            word32 src_strd,
//                            word32 dst_strd,
//                            word8 *pi1_coeff,
//                            word32 ht,
//                            word32 wd )

//**************variables vs registers*****************************************
//    x0  => *pu1_src   (start of the current band of 4 rows)
//    x1  => *pu1_dst   (start of the current band of 4 rows)
//    x2  => src_strd   (sign-extended from w2)
//    x3  => dst_strd   (sign-extended from w3)
//    x11 => ht         (rows still to copy, sign-extended from w5)
//    x16 => wd         (sign-extended from w6)
//    x8  => columns still to copy in the current band
//    x9  => source pointer walking rows 1..3 of the current chunk
//    x10 => destination pointer walking rows 1..3 of the current chunk
//    x15 => wd - chunk width (used to rewind to the start of the next band)

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_luma_copy_av8

.type ihevc_inter_pred_luma_copy_av8, %function

ihevc_inter_pred_luma_copy_av8:
    // The four integer arguments are 32-bit (word32). AAPCS64 leaves the upper
    // 32 bits of their registers unspecified, so widen them before they are
    // used in 64-bit address arithmetic and 64-bit compares.
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    sxtw        x11,w5                      //loads ht
    sxtw        x16,w6                      //loads wd

    cmp         x11,#0                      //ht <= 0 : nothing to copy
    ble         end_loops
    cmp         x16,#0                      //wd <= 0 : nothing to copy
    ble         end_loops

    tst         x16,#15                     //wd multiple of 16 ?
    beq         core_loop_wd_16
    tst         x16,#7                      //wd multiple of 8 ?
    beq         core_loop_wd_8

    // ---------------- 4-byte wide chunks ----------------
    sub         x15,x16,#4                  //offset of the last chunk within a row

outer_loop_wd_4:
    mov         x8,x16                      //columns left in this band (wd > 0 here)

inner_loop_wd_4:
    ld1         {v0.s}[0],[x0]              //row 0: load 4 bytes
    add         x9,x0,x2                    //x9 = source, row 1 of this chunk
    add         x10,x1,x3                   //x10 = destination, row 1 of this chunk
    st1         {v0.s}[0],[x1]              //row 0: store 4 bytes
    ld1         {v0.s}[0],[x9],x2           //row 1: load, step to row 2
    add         x0,x0,#4                    //pu1_src += 4 (next chunk)
    st1         {v0.s}[0],[x10],x3          //row 1: store, step to row 2
    ld1         {v0.s}[0],[x9],x2           //row 2: load, step to row 3
    subs        x8,x8,#4                    //columns left -= 4 (flags used by bgt below)
    st1         {v0.s}[0],[x10],x3          //row 2: store, step to row 3
    ld1         {v0.s}[0],[x9],x2           //row 3: load, x9 now points at row 4
    add         x1,x1,#4                    //pu1_dst += 4 (next chunk)
    st1         {v0.s}[0],[x10],x3          //row 3: store, x10 now points at row 4
    bgt         inner_loop_wd_4

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of the next band
    sub         x1,x10,x15                  //pu1_dst = start of the next band
    bgt         outer_loop_wd_4

end_loops:
    ret

    // ---------------- 8-byte wide chunks ----------------
core_loop_wd_8:
    sub         x15,x16,#8                  //offset of the last chunk within a row

outer_loop_wd_8:
    mov         x8,x16                      //columns left in this band (wd > 0 here)

inner_loop_wd_8:
    add         x9,x0,x2                    //x9 = source, row 1 of this chunk
    ld1         {v0.8b},[x0],#8             //row 0: load 8 bytes, pu1_src += 8
    add         x10,x1,x3                   //x10 = destination, row 1 of this chunk
    st1         {v0.8b},[x1],#8             //row 0: store 8 bytes, pu1_dst += 8
    ld1         {v1.8b},[x9],x2             //row 1: load, step to row 2
    st1         {v1.8b},[x10],x3            //row 1: store, step to row 2
    subs        x8,x8,#8                    //columns left -= 8 (flags used by bgt below)
    ld1         {v2.8b},[x9],x2             //row 2: load, step to row 3
    st1         {v2.8b},[x10],x3            //row 2: store, step to row 3
    ld1         {v3.8b},[x9],x2             //row 3: load, x9 now points at row 4
    st1         {v3.8b},[x10],x3            //row 3: store, x10 now points at row 4
    bgt         inner_loop_wd_8

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of the next band
    sub         x1,x10,x15                  //pu1_dst = start of the next band
    bgt         outer_loop_wd_8

    ret

    // ---------------- 16-byte wide chunks ----------------
core_loop_wd_16:
    sub         x15,x16,#16                 //offset of the last chunk within a row

outer_loop_wd_16:
    mov         x8,x16                      //columns left in this band (wd > 0 here)

inner_loop_wd_16:
    add         x9,x0,x2                    //x9 = source, row 1 of this chunk
    ld1         {v0.16b},[x0],#16           //row 0: load 16 bytes, pu1_src += 16
    add         x10,x1,x3                   //x10 = destination, row 1 of this chunk
    st1         {v0.16b},[x1],#16           //row 0: store 16 bytes, pu1_dst += 16
    ld1         {v1.16b},[x9],x2            //row 1: load, step to row 2
    st1         {v1.16b},[x10],x3           //row 1: store, step to row 2
    subs        x8,x8,#16                   //columns left -= 16 (flags used by bgt below)
    ld1         {v2.16b},[x9],x2            //row 2: load, step to row 3
    st1         {v2.16b},[x10],x3           //row 2: store, step to row 3
    ld1         {v3.16b},[x9],x2            //row 3: load, x9 now points at row 4
    st1         {v3.16b},[x10],x3           //row 3: store, x10 now points at row 4
    bgt         inner_loop_wd_16

    subs        x11,x11,#4                  //ht -= 4
    sub         x0,x9,x15                   //pu1_src = start of the next band
    sub         x1,x10,x15                  //pu1_dst = start of the next band
    bgt         outer_loop_wd_16

    ret

.size ihevc_inter_pred_luma_copy_av8, .-ihevc_inter_pred_luma_copy_av8