///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//*     http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_inter_pred_chroma_copy.s
//*
//* @brief
//*  Contains function definitions for inter prediction interpolation.
//*  Functions are coded in AArch64 NEON assembly (GNU assembler syntax).
//*
//* @author
//*  Yogeswaran RS
//*
//* @par List of Functions:
//*  - ihevc_inter_pred_chroma_copy_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  Chroma interprediction filter for copy
//*
//* @par Description:
//*  Copies 'ht' rows of (2 * 'wd') bytes each from the location pointed to by
//*  'src' to the location pointed to by 'dst'. Consecutive rows are 'src_strd'
//*  and 'dst_strd' bytes apart respectively. The width is doubled on entry
//*  (wd << 1), presumably because the chroma samples are stored interleaved -
//*  confirm against the caller.
//*
//*  Rows are copied in bands of four. If ht is not a multiple of four, one
//*  final band of exactly two rows is copied.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] pi1_coeff
//*  WORD8 pointer to the filter coefficients (never read by this function;
//*  its register is reused as the column counter)
//*
//* @param[in] ht
//*  integer height of the array; nothing is copied when ht <= 0
//*
//* @param[in] wd
//*  integer width of the array; nothing is copied when wd <= 0
//*
//* @returns
//*  None
//*
//* @remarks
//*  - ht is expected to be even: the tail band always copies two rows, so an
//*    odd ht would copy one row too many (ht % 4 == 1) or one row too few
//*    (ht % 4 == 3).
//*  - (2 * wd) is expected to be a multiple of 4: the narrowest copy unit is
//*    four bytes.
//*
//*******************************************************************************
//*/

//void ihevc_inter_pred_chroma_copy( UWORD8 *pu1_src,
//                                   UWORD8 *pu1_dst,
//                                   WORD32 src_strd,
//                                   WORD32 dst_strd,
//                                   WORD8 *pi1_coeff,
//                                   WORD32 ht,
//                                   WORD32 wd)
//**************Variables Vs Registers*****************************************
//x0  => *pu1_src
//x1  => *pu1_dst
//x2  => src_strd
//x3  => dst_strd
//x4  => *pi1_coeff (unused) / bytes of the current row still to be copied
//x5  => ht, then ht rounded down to a multiple of 4 (rows left in 4-row bands)
//x6  => wd on entry, then the destination row pointer inside a band
//x7  => source row pointer inside a band
//x8  => ht % 4 (non-zero selects the 2-row tail)
//x11 => (2 * wd) - copy unit, used to step back to the start of the row
//x12 => 2 * wd (row width in bytes)

.text
.align 4

.globl ihevc_inter_pred_chroma_copy_av8

.type ihevc_inter_pred_chroma_copy_av8, %function

ihevc_inter_pred_chroma_copy_av8:

    // The WORD32 arguments arrive in w2/w3/w5/w6. AAPCS64 leaves bits [63:32]
    // of those registers unspecified, so widen them before any 64-bit use.
    SXTW        x2,w2                       //src_strd
    SXTW        x3,w3                       //dst_strd
    SXTW        x5,w5                       //ht
    SXTW        x6,w6                       //wd

    LSL         x12,x6,#1                   //row width in bytes = wd << 1
    CMP         x5,#0                       //nothing to do when ht <= 0
    BLE         END_LOOPS
    AND         x8,x5,#3                    //x8 = ht % 4 (rows left for the tail)
    SUB         x5,x5,x8                    //x5 = ht rounded down to a multiple of 4
    TST         x12,#15                     //row width a multiple of 16 bytes?
    BEQ         CORE_LOOP_WD_16
    TST         x12,#7                      //row width a multiple of 8 bytes?
    BEQ         CORE_LOOP_WD_8
    SUB         x11,x12,#4                  //offset of the last 4-byte column
    CMP         x5,#0                       //ht < 4: only the 2-row tail remains
    BEQ         OUTER_LOOP_WD_4_HT_2

//------------------------------------------------------------------------------
// 4 bytes per step, 4 rows per band
//------------------------------------------------------------------------------
OUTER_LOOP_WD_4:
    SUBS        x4,x12,#0                   //x4 = bytes left in this band's rows
    BLE         END_INNER_LOOP_WD_4

INNER_LOOP_WD_4:
    LD1         {v0.s}[0],[x0]              //load row 0
    ADD         x7,x0,x2                    //x7 = source, row 1 of this column
    ADD         x6,x1,x3                    //x6 = destination, row 1 of this column
    ST1         {v0.s}[0],[x1]              //store row 0
    LD1         {v0.s}[0],[x7],x2           //load row 1, step to row 2
    ADD         x0,x0,#4                    //pu1_src += 4 (next column)
    ST1         {v0.s}[0],[x6],x3           //store row 1, step to row 2
    LD1         {v0.s}[0],[x7],x2           //load row 2, step to row 3
    SUBS        x4,x4,#4                    //4 more bytes of the row done
    ST1         {v0.s}[0],[x6],x3           //store row 2, step to row 3
    LD1         {v0.s}[0],[x7],x2           //load row 3, step to next band
    ADD         x1,x1,#4                    //pu1_dst += 4 (next column)
    ST1         {v0.s}[0],[x6],x3           //store row 3, step to next band
    BGT         INNER_LOOP_WD_4

END_INNER_LOOP_WD_4:
    SUBS        x5,x5,#4                    //ht -= 4
    SUB         x0,x7,x11                   //pu1_src = first column of the next band
    SUB         x1,x6,x11                   //pu1_dst = first column of the next band
    BGT         OUTER_LOOP_WD_4             //flags are still those of the SUBS above
    CMP         x8,#0                       //any rows left over for the tail?
    BGT         OUTER_LOOP_WD_4_HT_2

END_LOOPS:
    RET

//------------------------------------------------------------------------------
// 4 bytes per step, 2-row tail
//------------------------------------------------------------------------------
OUTER_LOOP_WD_4_HT_2:
    SUBS        x4,x12,#0                   //x4 = bytes left in the tail rows
    BLE         END_LOOPS

INNER_LOOP_WD_4_HT_2:
    LD1         {v0.s}[0],[x0]              //load row 0
    ADD         x7,x0,x2                    //x7 = source, row 1 of this column
    ADD         x6,x1,x3                    //x6 = destination, row 1 of this column
    ST1         {v0.s}[0],[x1]              //store row 0
    LD1         {v0.s}[0],[x7],x2           //load row 1
    ADD         x0,x0,#4                    //pu1_src += 4 (next column)
    ST1         {v0.s}[0],[x6],x3           //store row 1
    SUBS        x4,x4,#4                    //4 more bytes of the row done
    ADD         x1,x1,#4                    //pu1_dst += 4 (next column)
    BGT         INNER_LOOP_WD_4_HT_2
    B           END_LOOPS

//------------------------------------------------------------------------------
// 8 bytes per step, 4 rows per band
//------------------------------------------------------------------------------
CORE_LOOP_WD_8:
    SUB         x11,x12,#8                  //offset of the last 8-byte column
    CMP         x5,#0                       //ht < 4: only the 2-row tail remains
    BEQ         OUTER_LOOP_WD_8_HT_2

OUTER_LOOP_WD_8:
    SUBS        x4,x12,#0                   //x4 = bytes left in this band's rows
    BLE         END_INNER_LOOP_WD_8

INNER_LOOP_WD_8:
    ADD         x7,x0,x2                    //x7 = source, row 1 of this column
    LD1         {v0.8b},[x0],#8             //load row 0, pu1_src += 8
    ADD         x6,x1,x3                    //x6 = destination, row 1 of this column
    ST1         {v0.8b},[x1],#8             //store row 0, pu1_dst += 8
    LD1         {v1.8b},[x7],x2             //load row 1, step to row 2
    ST1         {v1.8b},[x6],x3             //store row 1, step to row 2
    SUBS        x4,x4,#8                    //8 more bytes of the row done
    LD1         {v2.8b},[x7],x2             //load row 2, step to row 3
    ST1         {v2.8b},[x6],x3             //store row 2, step to row 3
    LD1         {v3.8b},[x7],x2             //load row 3, step to next band
    ST1         {v3.8b},[x6],x3             //store row 3, step to next band
    BGT         INNER_LOOP_WD_8

END_INNER_LOOP_WD_8:
    SUBS        x5,x5,#4                    //ht -= 4
    SUB         x0,x7,x11                   //pu1_src = first column of the next band
    SUB         x1,x6,x11                   //pu1_dst = first column of the next band
    BGT         OUTER_LOOP_WD_8             //flags are still those of the SUBS above
    CMP         x8,#0                       //any rows left over for the tail?
    BGT         OUTER_LOOP_WD_8_HT_2
    B           END_LOOPS

//------------------------------------------------------------------------------
// 8 bytes per step, 2-row tail
//------------------------------------------------------------------------------
OUTER_LOOP_WD_8_HT_2:
    SUBS        x4,x12,#0                   //x4 = bytes left in the tail rows
    BLE         END_LOOPS

INNER_LOOP_WD_8_HT_2:
    ADD         x7,x0,x2                    //x7 = source, row 1 of this column
    LD1         {v0.8b},[x0],#8             //load row 0, pu1_src += 8
    ADD         x6,x1,x3                    //x6 = destination, row 1 of this column
    ST1         {v0.8b},[x1],#8             //store row 0, pu1_dst += 8
    LD1         {v1.8b},[x7],x2             //load row 1
    ST1         {v1.8b},[x6],x3             //store row 1
    SUBS        x4,x4,#8                    //8 more bytes of the row done
    BGT         INNER_LOOP_WD_8_HT_2        //walk the full row width, as the 4-byte tail does
    B           END_LOOPS

//------------------------------------------------------------------------------
// 16 bytes per step, 4 rows per band
//------------------------------------------------------------------------------
CORE_LOOP_WD_16:
    SUB         x11,x12,#16                 //offset of the last 16-byte column
    CMP         x5,#0                       //ht < 4: only the 2-row tail remains
    BEQ         OUTER_LOOP_WD_16_HT_2

OUTER_LOOP_WD_16:
    SUBS        x4,x12,#0                   //x4 = bytes left in this band's rows
    BLE         END_INNER_LOOP_WD_16

INNER_LOOP_WD_16:
    ADD         x7,x0,x2                    //x7 = source, row 1 of this column
    LD1         {v0.16b},[x0],#16           //load row 0, pu1_src += 16
    ADD         x6,x1,x3                    //x6 = destination, row 1 of this column
    ST1         {v0.16b},[x1],#16           //store row 0, pu1_dst += 16
    LD1         {v1.16b},[x7],x2            //load row 1, step to row 2
    ST1         {v1.16b},[x6],x3            //store row 1, step to row 2
    SUBS        x4,x4,#16                   //16 more bytes of the row done
    LD1         {v2.16b},[x7],x2            //load row 2, step to row 3
    ST1         {v2.16b},[x6],x3            //store row 2, step to row 3
    LD1         {v3.16b},[x7],x2            //load row 3, step to next band
    ST1         {v3.16b},[x6],x3            //store row 3, step to next band
    BGT         INNER_LOOP_WD_16

END_INNER_LOOP_WD_16:
    SUBS        x5,x5,#4                    //ht -= 4
    SUB         x0,x7,x11                   //pu1_src = first column of the next band
    SUB         x1,x6,x11                   //pu1_dst = first column of the next band
    BGT         OUTER_LOOP_WD_16            //flags are still those of the SUBS above
    CMP         x8,#0                       //any rows left over for the tail?
    BGT         OUTER_LOOP_WD_16_HT_2
    B           END_LOOPS

//------------------------------------------------------------------------------
// 16 bytes per step, 2-row tail
//------------------------------------------------------------------------------
OUTER_LOOP_WD_16_HT_2:
    SUBS        x4,x12,#0                   //x4 = bytes left in the tail rows
    BLE         END_LOOPS

INNER_LOOP_WD_16_HT_2:
    ADD         x7,x0,x2                    //x7 = source, row 1 of this column
    LD1         {v0.16b},[x0],#16           //load row 0, pu1_src += 16
    ADD         x6,x1,x3                    //x6 = destination, row 1 of this column
    ST1         {v0.16b},[x1],#16           //store row 0, pu1_dst += 16
    LD1         {v1.16b},[x7],x2            //load row 1
    ST1         {v1.16b},[x6],x3            //store row 1
    SUBS        x4,x4,#16                   //16 more bytes of the row done
    BGT         INNER_LOOP_WD_16_HT_2       //walk the full row width, as the 4-byte tail does

    RET