;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_sub_pixel_variance8x8_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;-----------------------------------------------------------------------------
; unsigned int vp8_sub_pixel_variance8x8_neon(
;     unsigned char *src_ptr,            ; r0
;     int src_pixels_per_line,           ; r1
;     int xoffset,                       ; r2  (eighth-pel horizontal offset)
;     int yoffset,                       ; r3  (eighth-pel vertical offset)
;     unsigned char *dst_ptr,            ; stack -> r4
;     int dst_pixels_per_line,           ; stack -> r5
;     unsigned int *sse)                 ; stack -> lr
;
; Bilinearly interpolates an 8x8 source block at the given sub-pel offsets
; (first pass horizontal over 9 rows, second pass vertical), then computes
; the variance against dst: writes the sum of squared differences to *sse
; and returns  variance = sse - (sum*sum)/64  in r0.
; Either filter pass is skipped entirely when its offset is 0.
;
; note: most of the code is copied from bilinear_predict8x8_neon and
; vp8_variance8x8_neon.
;-----------------------------------------------------------------------------

|vp8_sub_pixel_variance8x8_neon| PROC
    push            {r4-r5, lr}

    adr             r12, bilinear_taps_coeff    ;r12 = base of 2-tap filter table
    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
    ldr             lr, [sp, #20]           ;load *sse from stack

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             skip_firstpass_filter

;First pass: output_height lines x output_width columns (9x8)
;Horizontal bilinear filter.  9 rows are produced (d22-d30) because the
;vertical second pass needs one extra row below the 8x8 block.
    add             r2, r12, r2, lsl #3     ;calculate filter location (8 bytes per tap pair)

    vld1.u8         {q1}, [r0], r1          ;load src data (row 0; 16 bytes, only 9 used)
    vld1.u32        {d31}, [r2]             ;load first_pass filter
    vld1.u8         {q2}, [r0], r1          ;row 1
    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1) = (128-x*16, x*16)
    vld1.u8         {q3}, [r0], r1          ;row 2
    vdup.8          d1, d31[4]
    vld1.u8         {q4}, [r0], r1          ;row 3

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0

    vext.8          d3, d2, d3, #1          ;shift row left 1 byte -> src_ptr[1..8]
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1

    vmlal.u8        q6, d3, d1              ;+ (src_ptr[1] * Filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1

    vld1.u8         {q1}, [r0], r1          ;load src data (row 4), interleaved with narrowing
    vqrshrn.u16     d22, q6, #7             ;shift/round/saturate to u8 (divide by 128)
    vld1.u8         {q2}, [r0], r1          ;row 5
    vqrshrn.u16     d23, q7, #7
    vld1.u8         {q3}, [r0], r1          ;row 6
    vqrshrn.u16     d24, q8, #7
    vld1.u8         {q4}, [r0], r1          ;row 7
    vqrshrn.u16     d25, q9, #7

    ;first_pass filtering on the rest 5-line data
    vld1.u8         {q5}, [r0], r1          ;row 8 (the extra bottom row)

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0
    vmull.u8        q10, d10, d0

    vext.8          d3, d2, d3, #1          ;shift row left 1 byte -> src_ptr[1..8]
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1
    vext.8          d11, d10, d11, #1

    vmlal.u8        q6, d3, d1              ;+ (src_ptr[1] * Filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1
    vmlal.u8        q10, d11, d1

    vqrshrn.u16     d26, q6, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d27, q7, #7
    vqrshrn.u16     d28, q8, #7
    vqrshrn.u16     d29, q9, #7
    vqrshrn.u16     d30, q10, #7            ;first-pass result now in d22-d30 (9 rows)

;Second pass: 8x8
;Vertical bilinear filter between adjacent rows of the first-pass output.
secondpass_filter
    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
    ;skip_secondpass_filter
    beq             sub_pixel_variance8x8_neon

    add             r3, r12, r3, lsl #3     ;calculate filter location

    vld1.u32        {d31}, [r3]             ;load second_pass filter

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]

    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])  row i
    vmull.u8        q2, d23, d0
    vmull.u8        q3, d24, d0
    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d23, d1             ;+ (src_ptr[pixel_step] * Filter[1])  row i+1
    vmlal.u8        q2, d24, d1
    vmlal.u8        q3, d25, d1
    vmlal.u8        q4, d26, d1
    vmlal.u8        q5, d27, d1
    vmlal.u8        q6, d28, d1
    vmlal.u8        q7, d29, d1
    vmlal.u8        q8, d30, d1

    vqrshrn.u16     d22, q1, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d23, q2, #7
    vqrshrn.u16     d24, q3, #7
    vqrshrn.u16     d25, q4, #7
    vqrshrn.u16     d26, q5, #7
    vqrshrn.u16     d27, q6, #7
    vqrshrn.u16     d28, q7, #7
    vqrshrn.u16     d29, q8, #7             ;filtered 8x8 block in d22-d29

    b               sub_pixel_variance8x8_neon

;--------------------
;xoffset == 0: no horizontal filtering; just copy 9 source rows into the
;registers the second pass expects (d22-d30).
skip_firstpass_filter
    vld1.u8         {d22}, [r0], r1         ;load src data
    vld1.u8         {d23}, [r0], r1
    vld1.u8         {d24}, [r0], r1
    vld1.u8         {d25}, [r0], r1
    vld1.u8         {d26}, [r0], r1
    vld1.u8         {d27}, [r0], r1
    vld1.u8         {d28}, [r0], r1
    vld1.u8         {d29}, [r0], r1
    vld1.u8         {d30}, [r0], r1

    b               secondpass_filter

;----------------------
;vp8_variance8x8_neon
;Variance of the filtered block (d22-d29) vs dst.  Two loop iterations of
;4 rows each; q8 accumulates the signed sum of differences, q9/q10 the
;sum of squared differences.
sub_pixel_variance8x8_neon
    vmov.i8         q8, #0                  ;q8 - sum
    vmov.i8         q9, #0                  ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2                 ;loop counter: 2 x 4 rows

sub_pixel_variance8x8_neon_loop
    vld1.8          {d0}, [r4], r5          ;load dst data
    subs            r12, r12, #1
    vld1.8          {d1}, [r4], r5
    vld1.8          {d2}, [r4], r5
    vsubl.u8        q4, d22, d0             ;calculate diff (widened to s16)
    vld1.8          {d3}, [r4], r5

    vsubl.u8        q5, d23, d1
    vsubl.u8        q6, d24, d2

    vpadal.s16      q8, q4                  ;sum
    vmlal.s16       q9, d8, d8              ;sse
    vmlal.s16       q10, d9, d9

    vsubl.u8        q7, d25, d3

    vpadal.s16      q8, q5
    vmlal.s16       q9, d10, d10
    vmlal.s16       q10, d11, d11

    vmov            q11, q13                ;shift rows 4-5 into d22-d23 for next pass

    vpadal.s16      q8, q6
    vmlal.s16       q9, d12, d12
    vmlal.s16       q10, d13, d13

    vmov            q12, q14                ;shift rows 6-7 into d24-d25

    vpadal.s16      q8, q7
    vmlal.s16       q9, d14, d14
    vmlal.s16       q10, d15, d15

    bne             sub_pixel_variance8x8_neon_loop

    vadd.u32        q10, q9, q10            ;accumulate sse
    vpaddl.s32      q0, q8                  ;accumulate sum (pairwise widen to s64)

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1              ;d0 = total signed sum
    vadd.u64        d1, d2, d3              ;d1 = total sse

    vmull.s32       q5, d0, d0              ;d10 = sum*sum (fits in 32 bits: |sum| <= 64*255)
    vst1.32         {d1[0]}, [lr]           ;store sse
    vshr.u32        d10, d10, #6            ;sum*sum / 64
    vsub.u32        d0, d1, d10             ;variance = sse - sum*sum/64

    vmov.32         r0, d0[0]               ;return
    pop             {r4-r5, pc}

    ENDP

;-----------------
;Bilinear filter tap pairs (Filter[0], Filter[1]) = (128 - 16*offset,
;16*offset) for eighth-pel offsets 0..7; indexed by (offset << 3) bytes.

bilinear_taps_coeff
    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

    END