@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@*
@* @brief
@*     interprediction luma function for copy
@*
@* @par description:
@*   copies the array of width 'wd' and height 'ht' from the location pointed
@*   by 'src' to the location pointed by 'dst'
@*
@*   the copy is done in tiles that are 4 rows tall and 16, 8 or 4 bytes wide,
@*   the widest tile that divides 'wd' being selected once on entry
@*
@* @param[in] pu1_src
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] pi1_coeff
@*  word8 pointer to the filter coefficients
@*  (never read by this routine; present only to match the prototype)
@*
@* @param[in] ht
@*  integer height of the array
@*  (rows are copied four at a time, so a height that is not a multiple of 4
@*   is effectively rounded up to the next multiple of 4)
@*
@* @param[in] wd
@*  integer width of the array
@*  (must be a multiple of 4; the per-row pointer rewind relies on it)
@*
@* @returns
@*  none; returns immediately without touching memory when ht <= 0 or wd <= 0
@*
@* @remarks
@*  clobbers neon registers d0-d7 (q0-q3) and the condition flags;
@*  r4-r12 are saved on entry and restored on return
@*
@*******************************************************************************
@*/
@void ihevc_inter_pred_luma_copy (
@                            uword8 *pu1_src,
@                            uword8 *pu1_dst,
@                            word32 src_strd,
@                            word32 dst_strd,
@                            word8 *pi1_coeff,
@                            word32 ht,
@                            word32 wd   )

@**************variables vs registers*****************************************
@   r0  => *pu1_src          (start of the current tile in the source)
@   r1  => *pu1_dst          (start of the current tile in the destination)
@   r2  => src_strd
@   r3  => dst_strd
@   r4  => bytes of the current 4-row band still to be copied
@   r5  => pu1_src_tmp       (walks rows 1..3 of the current source tile)
@   r6  => pu1_dst_tmp       (walks rows 1..3 of the current destination tile)
@   r7  => ht                (rows still to be copied)
@   r11 => wd - tile width   (rewind applied when moving to the next band)
@   r12 => wd

.text
.align 4

.globl ihevc_inter_pred_luma_copy_a9q

.type ihevc_inter_pred_luma_copy_a9q, %function

ihevc_inter_pred_luma_copy_a9q:
    stmfd       sp!, {r4-r12, r14}          @save 10 registers (40 bytes); stack args now start at sp+40
    ldr         r12,[sp,#48]                @r12 = wd (7th argument)
    ldr         r7,[sp,#44]                 @r7 = ht (6th argument)
    cmp         r7,#0                       @nothing to copy when ht <= 0
    ble         end_loops
    cmp         r12,#0                      @nothing to copy when wd <= 0; checked once here
    ble         end_loops                   @so that the band loops never run with an empty row
    tst         r12,#15                     @wd a multiple of 16 -> 16-byte wide tiles
    beq         core_loop_wd_16
    tst         r12,#7                      @wd a multiple of 8 -> 8-byte wide tiles
    beq         core_loop_wd_8
    sub         r11,r12,#4                  @otherwise 4-byte wide tiles; r11 = wd - 4

@ one pass of the outer loop copies a band of 4 rows, one pass of the inner
@ loop copies a 4x4 tile of that band
outer_loop_wd_4:
    mov         r4,r12                      @r4 = wd, bytes left in this band (> 0 here)

inner_loop_wd_4:
    vld1.32     {d0[0]},[r0]                @load 4 bytes of row 0
    add         r5,r0,r2                    @pu1_src_tmp = pu1_src + src_strd (row 1)
    add         r6,r1,r3                    @pu1_dst_tmp = pu1_dst + dst_strd (row 1)
    vst1.32     {d0[0]},[r1]                @store row 0
    vld1.32     {d0[0]},[r5],r2             @load row 1, pu1_src_tmp += src_strd
    add         r0,r0,#4                    @pu1_src += 4 (next tile of the band)
    vst1.32     {d0[0]},[r6],r3             @store row 1, pu1_dst_tmp += dst_strd
    vld1.32     {d0[0]},[r5],r2             @load row 2
    subs        r4,r4,#4                    @4 bytes of the band done; flags feed the bgt below
    vst1.32     {d0[0]},[r6],r3             @store row 2
    vld1.32     {d0[0]},[r5],r2             @load row 3
    add         r1,r1,#4                    @pu1_dst += 4 (next tile of the band)
    vst1.32     {d0[0]},[r6],r3             @store row 3

    bgt         inner_loop_wd_4             @more tiles left in this band

    subs        r7,r7,#4                    @ht -= 4
    sub         r0,r5,r11                   @r5 is 4 rows below the last tile; rewind to column 0
    sub         r1,r6,r11                   @same rewind for the destination
    bgt         outer_loop_wd_4             @more bands left

end_loops:
    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (pc <- saved lr)


core_loop_wd_8:
    sub         r11,r12,#8                  @r11 = wd - 8

outer_loop_wd_8:
    mov         r4,r12                      @r4 = wd, bytes left in this band (> 0 here)

inner_loop_wd_8:
    add         r5,r0,r2                    @pu1_src_tmp = pu1_src + src_strd (row 1)
    vld1.8      {d0},[r0]!                  @load 8 bytes of row 0, pu1_src += 8
    add         r6,r1,r3                    @pu1_dst_tmp = pu1_dst + dst_strd (row 1)
    vst1.8      {d0},[r1]!                  @store row 0, pu1_dst += 8
    vld1.8      {d1},[r5],r2                @load row 1, pu1_src_tmp += src_strd
    vst1.8      {d1},[r6],r3                @store row 1, pu1_dst_tmp += dst_strd
    subs        r4,r4,#8                    @8 bytes of the band done; flags feed the bgt below
    vld1.8      {d2},[r5],r2                @load row 2
    vst1.8      {d2},[r6],r3                @store row 2
    vld1.8      {d3},[r5],r2                @load row 3
    vst1.8      {d3},[r6],r3                @store row 3
    bgt         inner_loop_wd_8             @more tiles left in this band

    subs        r7,r7,#4                    @ht -= 4
    sub         r0,r5,r11                   @r5 is 4 rows below the last tile; rewind to column 0
    sub         r1,r6,r11                   @same rewind for the destination
    bgt         outer_loop_wd_8             @more bands left

    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (pc <- saved lr)

core_loop_wd_16:
    sub         r11,r12,#16                 @r11 = wd - 16

outer_loop_wd_16:
    mov         r4,r12                      @r4 = wd, bytes left in this band (> 0 here)

inner_loop_wd_16:
    add         r5,r0,r2                    @pu1_src_tmp = pu1_src + src_strd (row 1)
    vld1.8      {q0},[r0]!                  @load 16 bytes of row 0, pu1_src += 16
    add         r6,r1,r3                    @pu1_dst_tmp = pu1_dst + dst_strd (row 1)
    vst1.8      {q0},[r1]!                  @store row 0, pu1_dst += 16
    vld1.8      {q1},[r5],r2                @load row 1, pu1_src_tmp += src_strd
    vst1.8      {q1},[r6],r3                @store row 1, pu1_dst_tmp += dst_strd
    subs        r4,r4,#16                   @16 bytes of the band done; flags feed the bgt below
    vld1.8      {q2},[r5],r2                @load row 2
    vst1.8      {q2},[r6],r3                @store row 2
    vld1.8      {q3},[r5],r2                @load row 3
    vst1.8      {q3},[r6],r3                @store row 3
    bgt         inner_loop_wd_16            @more tiles left in this band

    subs        r7,r7,#4                    @ht -= 4
    sub         r0,r5,r11                   @r5 is 4 rows below the last tile; rewind to column 0
    sub         r1,r6,r11                   @same rewind for the destination
    bgt         outer_loop_wd_16            @more bands left

    ldmfd       sp!,{r4-r12,r15}            @restore registers and return (pc <- saved lr)