;
;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Syntax: armasm. Instruction set: ARM state, using the byte-parallel
; media instructions (usub8 / sel / usad8 / uxtb16 / smlad).
;
; Technique shared by all three routines, applied to 4 pixels at a time:
;   usub8  d, a, b      per-byte a - b; sets the GE flag of each byte lane
;                       where a >= b
;   sel    p, d, zero   keeps the lanes of d whose GE flag is set, else 0
; Doing this in both operand orders yields the positive and the negative
; differences as magnitudes in separate words.  OR-ing the two words gives
; |a - b| for every lane, which is then widened to halfwords (uxtb16) and
; squared/accumulated with smlad.

    EXPORT  |vpx_variance16x16_media|
    EXPORT  |vpx_variance8x8_media|
    EXPORT  |vpx_mse16x16_media|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;-----------------------------------------------------------------------
; vpx_variance16x16_media
;
; In:    r0     unsigned char *src_ptr
;        r1     int            source_stride
;        r2     unsigned char *ref_ptr
;        r3     int            recon_stride
;        stack  unsigned int  *sse
; Out:   r0     = sse - ((sum * sum) >> 8)
;        *sse   = sum of squared differences over the 16x16 block
; Regs:  r8  = signed sum of (src - ref)
;        r11 = sse accumulator
;        r12 = rows remaining
;        lr  = constant zero (set once, never written inside the loop)
;        r4-r7, r9, r10 = scratch
;-----------------------------------------------------------------------
|vpx_variance16x16_media| PROC

    stmfd   sp!, {r4-r12, lr}       ; 10 registers = 40 bytes

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r8, #0                  ; initialize sum = 0
    mov     r11, #0                 ; initialize sse = 0
    mov     r12, #16                ; set loop counter to 16 (=block height)
    mov     lr, #0                  ; constant zero, hoisted out of the loop

loop16x16
    ; 1st 4 pixels
    ldr     r4, [r0, #0]            ; load 4 src pixels
    ldr     r5, [r2, #0]            ; load 4 ref pixels

    usub8   r6, r4, r5              ; src - ref, GE set where src >= ref
    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr              ; keep lanes with positive difference
    usub8   r9, r5, r4              ; ref - src, GE set where ref >= src
    pld     [r2, r3, lsl #1]
    sel     r6, r9, lr              ; keep lanes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr              ; sum of positive differences
    usad8   r5, r6, lr              ; sum of negative differences (magnitude)
    orr     r6, r6, r7              ; absolute differences of all 4 pixels

    ; calculate total sum (flags are not needed here; the loop branch
    ; tests the flags produced by the row-counter subs below)
    add     r8, r8, r4              ; add positive differences to sum
    sub     r8, r8, r5              ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6                  ; bytes 0 and 2 to halfwords
    uxtb16  r10, r6, ror #8         ; bytes 1 and 3 to halfwords
    smlad   r11, r5, r5, r11        ; sse += d0*d0 + d2*d2

    ; 2nd 4 pixels
    ldr     r4, [r0, #4]            ; load 4 src pixels
    ldr     r5, [r2, #4]            ; load 4 ref pixels
    smlad   r11, r10, r10, r11      ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r6, r4, r5              ; src - ref
    sel     r7, r6, lr              ; keep lanes with positive difference
    usub8   r9, r5, r4              ; ref - src
    sel     r6, r9, lr              ; keep lanes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr              ; sum of positive differences
    usad8   r5, r6, lr              ; sum of negative differences (magnitude)
    orr     r6, r6, r7              ; absolute differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4              ; add positive differences to sum
    sub     r8, r8, r5              ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6                  ; bytes 0 and 2 to halfwords
    uxtb16  r10, r6, ror #8         ; bytes 1 and 3 to halfwords
    smlad   r11, r5, r5, r11        ; sse += d0*d0 + d2*d2

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]            ; load 4 src pixels
    ldr     r5, [r2, #8]            ; load 4 ref pixels
    smlad   r11, r10, r10, r11      ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r6, r4, r5              ; src - ref
    sel     r7, r6, lr              ; keep lanes with positive difference
    usub8   r9, r5, r4              ; ref - src
    sel     r6, r9, lr              ; keep lanes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr              ; sum of positive differences
    usad8   r5, r6, lr              ; sum of negative differences (magnitude)
    orr     r6, r6, r7              ; absolute differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4              ; add positive differences to sum
    sub     r8, r8, r5              ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6                  ; bytes 0 and 2 to halfwords
    uxtb16  r10, r6, ror #8         ; bytes 1 and 3 to halfwords
    smlad   r11, r5, r5, r11        ; sse += d0*d0 + d2*d2

    ; 4th 4 pixels
    ldr     r4, [r0, #12]           ; load 4 src pixels
    ldr     r5, [r2, #12]           ; load 4 ref pixels
    smlad   r11, r10, r10, r11      ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r6, r4, r5              ; src - ref
    add     r0, r0, r1              ; set src_ptr to next row
    sel     r7, r6, lr              ; keep lanes with positive difference
    usub8   r9, r5, r4              ; ref - src
    add     r2, r2, r3              ; set ref_ptr to next row
    sel     r6, r9, lr              ; keep lanes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr              ; sum of positive differences
    usad8   r5, r6, lr              ; sum of negative differences (magnitude)
    orr     r6, r6, r7              ; absolute differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4              ; add positive differences to sum
    sub     r8, r8, r5              ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6                  ; bytes 0 and 2 to halfwords
    uxtb16  r10, r6, ror #8         ; bytes 1 and 3 to halfwords
    smlad   r11, r5, r5, r11        ; sse += d0*d0 + d2*d2
    smlad   r11, r10, r10, r11      ; sse += d1*d1 + d3*d3

    subs    r12, r12, #1            ; next row

    bne     loop16x16

    ; return stuff
    ldr     r6, [sp, #40]           ; sse pointer sits above the 40 saved bytes
    mul     r0, r8, r8              ; sum * sum
    str     r11, [r6]               ; store sse
    ; |sum| can reach 256 * 255, whose square needs all 32 bits, so the
    ; shift must be logical (lsr), not arithmetic.
    sub     r0, r11, r0, lsr #8     ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

    ENDP

;-----------------------------------------------------------------------
; vpx_variance8x8_media
;
; In:    r0     unsigned char *src_ptr
;        r1     int            source_stride
;        r2     unsigned char *ref_ptr
;        r3     int            recon_stride
;        stack  unsigned int  *sse
; Out:   r0     = sse - ((sum * sum) >> 6)
;        *sse   = sum of squared differences over the 8x8 block
; Regs:  r4  = signed sum of (src - ref)
;        r5  = sse accumulator
;        r12 = rows remaining
;        lr  = constant zero (set once, never written inside the loop)
;        r6-r10 = scratch
;-----------------------------------------------------------------------
|vpx_variance8x8_media| PROC

    push    {r4-r10, lr}            ; 8 registers = 32 bytes

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r12, #8                 ; set loop counter to 8 (=block height)
    mov     r4, #0                  ; initialize sum = 0
    mov     r5, #0                  ; initialize sse = 0
    mov     lr, #0                  ; constant zero, hoisted out of the loop

loop8x8
    ; 1st 4 pixels
    ldr     r6, [r0, #0x0]          ; load 4 src pixels
    ldr     r7, [r2, #0x0]          ; load 4 ref pixels

    usub8   r8, r6, r7              ; src - ref, GE set where src >= ref
    pld     [r0, r1, lsl #1]
    sel     r10, r8, lr             ; keep lanes with positive difference
    usub8   r9, r7, r6              ; ref - src, GE set where ref >= src
    pld     [r2, r3, lsl #1]
    sel     r8, r9, lr              ; keep lanes with negative difference

    ; calculate partial sums
    usad8   r6, r10, lr             ; sum of positive differences
    usad8   r7, r8, lr              ; sum of negative differences (magnitude)
    orr     r8, r8, r10             ; absolute differences of all 4 pixels

    ; calculate total sum
    add     r4, r4, r6              ; add positive differences to sum
    sub     r4, r4, r7              ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r7, r8                  ; bytes 0 and 2 to halfwords
    uxtb16  r10, r8, ror #8         ; bytes 1 and 3 to halfwords
    smlad   r5, r7, r7, r5          ; sse += d0*d0 + d2*d2

    ; 2nd 4 pixels
    ldr     r6, [r0, #0x4]          ; load 4 src pixels
    ldr     r7, [r2, #0x4]          ; load 4 ref pixels
    smlad   r5, r10, r10, r5        ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r8, r6, r7              ; src - ref
    add     r0, r0, r1              ; set src_ptr to next row
    sel     r10, r8, lr             ; keep lanes with positive difference
    usub8   r9, r7, r6              ; ref - src
    add     r2, r2, r3              ; set ref_ptr to next row
    sel     r8, r9, lr              ; keep lanes with negative difference

    ; calculate partial sums
    usad8   r6, r10, lr             ; sum of positive differences
    usad8   r7, r8, lr              ; sum of negative differences (magnitude)
    orr     r8, r8, r10             ; absolute differences of all 4 pixels

    ; calculate total sum
    add     r4, r4, r6              ; add positive differences to sum
    sub     r4, r4, r7              ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r7, r8                  ; bytes 0 and 2 to halfwords
    uxtb16  r10, r8, ror #8         ; bytes 1 and 3 to halfwords
    smlad   r5, r7, r7, r5          ; sse += d0*d0 + d2*d2
    subs    r12, r12, #1            ; next row (smlad below leaves Z intact)
    smlad   r5, r10, r10, r5        ; sse += d1*d1 + d3*d3

    bne     loop8x8

    ; return stuff
    ldr     r8, [sp, #32]           ; sse pointer sits above the 32 saved bytes
    mul     r1, r4, r4              ; sum * sum
    str     r5, [r8]                ; store sse
    ; |sum| is at most 64 * 255 here, so sum * sum stays below 2^31 and the
    ; arithmetic shift gives the same result as a logical one.
    sub     r0, r5, r1, ASR #6      ; return (sse - ((sum * sum) >> 6))

    pop     {r4-r10, pc}

    ENDP

;-----------------------------------------------------------------------
; vpx_mse16x16_media
;
; In:    r0     unsigned char *src_ptr
;        r1     int            source_stride
;        r2     unsigned char *ref_ptr
;        r3     int            recon_stride
;        stack  unsigned int  *sse
; Out:   r0     = sse
;        *sse   = sum of squared differences over the 16x16 block
; Regs:  r4  = sse accumulator
;        r12 = rows remaining
;        lr  = constant zero (set once, never written inside the loop)
;        r5-r9 = scratch
;
;note: Based on vpx_variance16x16_media. In this function, sum is never used.
;      So the partial-sum (usad8) calculation is left out entirely.
;-----------------------------------------------------------------------
|vpx_mse16x16_media| PROC

    ; 7 registers = 28 bytes.  This is a leaf routine (no calls are made),
    ; so the 4-byte-aligned sp in between is never seen by a callee.
    push    {r4-r9, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r12, #16                ; set loop counter to 16 (=block height)
    mov     r4, #0                  ; initialize sse = 0
    mov     lr, #0                  ; constant zero, hoisted out of the loop

loopmse
    ; 1st 4 pixels
    ldr     r5, [r0, #0x0]          ; load 4 src pixels
    ldr     r6, [r2, #0x0]          ; load 4 ref pixels

    usub8   r8, r5, r6              ; src - ref, GE set where src >= ref
    pld     [r0, r1, lsl #1]
    sel     r7, r8, lr              ; keep lanes with positive difference
    usub8   r9, r6, r5              ; ref - src, GE set where ref >= src
    pld     [r2, r3, lsl #1]
    sel     r8, r9, lr              ; keep lanes with negative difference

    orr     r8, r8, r7              ; absolute differences of all 4 pixels

    ldr     r5, [r0, #0x4]          ; load 4 src pixels (2nd group)

    ; calculate sse
    uxtb16  r6, r8                  ; bytes 0 and 2 to halfwords
    uxtb16  r7, r8, ror #8          ; bytes 1 and 3 to halfwords
    smlad   r4, r6, r6, r4          ; sse += d0*d0 + d2*d2

    ; 2nd 4 pixels
    ldr     r6, [r2, #0x4]          ; load 4 ref pixels
    smlad   r4, r7, r7, r4          ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r8, r5, r6              ; src - ref
    sel     r7, r8, lr              ; keep lanes with positive difference
    usub8   r9, r6, r5              ; ref - src
    sel     r8, r9, lr              ; keep lanes with negative difference

    orr     r8, r8, r7              ; absolute differences of all 4 pixels

    ldr     r5, [r0, #0x8]          ; load 4 src pixels (3rd group)

    ; calculate sse
    uxtb16  r6, r8                  ; bytes 0 and 2 to halfwords
    uxtb16  r7, r8, ror #8          ; bytes 1 and 3 to halfwords
    smlad   r4, r6, r6, r4          ; sse += d0*d0 + d2*d2

    ; 3rd 4 pixels
    ldr     r6, [r2, #0x8]          ; load 4 ref pixels
    smlad   r4, r7, r7, r4          ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r8, r5, r6              ; src - ref
    sel     r7, r8, lr              ; keep lanes with positive difference
    usub8   r9, r6, r5              ; ref - src
    sel     r8, r9, lr              ; keep lanes with negative difference

    orr     r8, r8, r7              ; absolute differences of all 4 pixels

    ldr     r5, [r0, #0xc]          ; load 4 src pixels (4th group)

    ; calculate sse
    uxtb16  r6, r8                  ; bytes 0 and 2 to halfwords
    uxtb16  r7, r8, ror #8          ; bytes 1 and 3 to halfwords
    smlad   r4, r6, r6, r4          ; sse += d0*d0 + d2*d2

    ; 4th 4 pixels
    ldr     r6, [r2, #0xc]          ; load 4 ref pixels
    smlad   r4, r7, r7, r4          ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r8, r5, r6              ; src - ref
    add     r0, r0, r1              ; set src_ptr to next row
    sel     r7, r8, lr              ; keep lanes with positive difference
    usub8   r9, r6, r5              ; ref - src
    add     r2, r2, r3              ; set ref_ptr to next row
    sel     r8, r9, lr              ; keep lanes with negative difference

    orr     r8, r8, r7              ; absolute differences of all 4 pixels

    subs    r12, r12, #1            ; next row (nothing below changes Z)

    ; calculate sse
    uxtb16  r6, r8                  ; bytes 0 and 2 to halfwords
    uxtb16  r7, r8, ror #8          ; bytes 1 and 3 to halfwords
    smlad   r4, r6, r6, r4          ; sse += d0*d0 + d2*d2
    smlad   r4, r7, r7, r4          ; sse += d1*d1 + d3*d3

    bne     loopmse

    ; return stuff
    ldr     r1, [sp, #28]           ; sse pointer sits above the 28 saved bytes
    mov     r0, r4                  ; return sse
    str     r4, [r1]                ; store sse

    pop     {r4-r9, pc}

    ENDP

    END