;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_variance16x16_neon|
    EXPORT  |vp8_variance16x8_neon|
    EXPORT  |vp8_variance8x16_neon|
    EXPORT  |vp8_variance8x8_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vp8_variance16x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8                     ;16x16 = 8 iterations of 2 rows

variance16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    ;VPADAL adds adjacent pairs of elements of a vector and accumulates
    ;the results into the elements of the destination vector. The explanation
    ;in the ARM guide is wrong.
    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
    ;vmov.32        r1, d1[0]
    ;mul            r0, r0, r0
    ;str            r1, [r12]
    ;sub            r0, r1, r0, lsr #8

    ;while sum is signed, sum * sum is always positive and must be treated as
    ;unsigned to avoid propagating the sign bit.
    vmull.s32       q5, d0, d0                  ;sum * sum
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #8                ;sum * sum / 256 (16x16 pixels)
    vsub.u32        d0, d1, d10                 ;sse - sum * sum / 256

    vmov.32         r0, d0[0]                   ;return variance
    bx              lr

    ENDP
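
; For reference, the function above computes what the following C sketch
; describes (illustrative only, added for clarity; the actual C reference
; implementation in libvpx may differ in detail):
;
;   unsigned int variance16x16(const unsigned char *src_ptr, int source_stride,
;                              const unsigned char *ref_ptr, int recon_stride,
;                              unsigned int *sse)
;   {
;       int i, j, diff, sum = 0;
;       unsigned int sse_acc = 0;
;
;       for (i = 0; i < 16; i++) {
;           for (j = 0; j < 16; j++) {
;               diff = src_ptr[j] - ref_ptr[j];
;               sum += diff;
;               sse_acc += (unsigned int)(diff * diff);
;           }
;           src_ptr += source_stride;
;           ref_ptr += recon_stride;
;       }
;
;       *sse = sse_acc;
;       /* the product is formed as unsigned, matching the note above; */
;       /* >> 8 divides by the 16*16 = 256 pixels in the block */
;       return sse_acc - (((unsigned int)sum * sum) >> 8);
;   }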

;================================
;unsigned int vp8_variance16x8_c(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *sse)
|vp8_variance16x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #4                     ;16x8 = 4 iterations of 2 rows

variance16x8_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0                  ;sum * sum
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #7                ;sum * sum / 128 (16x8 pixels)
    vsub.u32        d0, d1, d10                 ;sse - sum * sum / 128

    vmov.32         r0, d0[0]                   ;return variance
    bx              lr

    ENDP

;=================================
;unsigned int vp8_variance8x16_c(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *sse)

|vp8_variance8x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8                     ;8x16 = 8 iterations of 2 rows

variance8x16_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d2, d6

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    bne             variance8x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0                  ;sum * sum
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #7                ;sum * sum / 128 (8x16 pixels)
    vsub.u32        d0, d1, d10                 ;sse - sum * sum / 128

    vmov.32         r0, d0[0]                   ;return variance
    bx              lr

    ENDP
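
; Note (added for clarity): the four variants in this file share the same
; structure and differ only in the rows loaded per loop iteration, the loop
; count in r12, and the final shift, which divides sum * sum by the number
; of pixels in the block:
;
;   function                 pixels   r12   shift   divisor
;   vp8_variance16x16_neon   256      8     #8      256
;   vp8_variance16x8_neon    128      4     #7      128
;   vp8_variance8x16_neon    128      8     #7      128
;   vp8_variance8x8_neon     64       2     #6      64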
;==================================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2                     ;8x8 = 2 iterations of 4 rows

variance8x8_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance8x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0                  ;sum * sum
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #6                ;sum * sum / 64 (8x8 pixels)
    vsub.u32        d0, d1, d10                 ;sse - sum * sum / 64

    vmov.32         r0, d0[0]                   ;return variance
    bx              lr

    ENDP

    END