;
;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_mse16x16_armv6|

    ARM

    AREA    ||.text||, CODE, READONLY, ALIGN=2

;-----------------------------------------------------------------------
; vp8_mse16x16_armv6
;
; Sum of squared differences between a 16x16 block of source pixels and
; a 16x16 block of reference pixels. The result is both returned in r0
; and stored through the sse pointer.
;
; In:
;   r0      unsigned char *src_ptr
;   r1      int            source_stride
;   r2      unsigned char *ref_ptr
;   r3      int            recon_stride
;   stack   unsigned int  *sse
; Out:
;   r0      sse (the same value is written to *sse)
;
; Register roles inside the function:
;   r4      running sse accumulator
;   r5, r6  4 src / 4 ref pixels, then reused as scratch
;   r7, r8  per-byte differences, then zero-extended halfword pairs
;   r9      scratch for the reversed difference
;   r12     row counter (16 down to 0)
;   lr      constant zero (the return address is saved on the stack)
;
; Stack: push {r4-r9, lr} stores 7 words (28 bytes), so the fifth
;        argument (sse) is found at [sp, #28].
;
; note: Based on vp8_variance16x16_armv6. In this function, sum is never
;       used, so the partial-sum (usad8) calculation has been removed.
;
; NOTE(review): pixels are fetched with word loads (ldr) from byte
;       pointers - presumably this relies on unaligned-access support
;       or on callers passing suitably aligned buffers; confirm.
;-----------------------------------------------------------------------

|vp8_mse16x16_armv6| PROC

    push    {r4-r9, lr}

    pld     [r0, r1, lsl #0]    ; prefetch src_ptr + source_stride
    pld     [r2, r3, lsl #0]    ; prefetch ref_ptr + recon_stride

    mov     r12, #16            ; set loop counter to 16 (=block height)
    mov     r4, #0              ; initialize sse = 0
    mov     lr, #0              ; constant zero (loop invariant)

loop
    ; 1st 4 pixels
    ldr     r5, [r0, #0x0]      ; load 4 src pixels
    ldr     r6, [r2, #0x0]      ; load 4 ref pixels

    ; |src - ref| per byte: usub8 sets the per-byte GE flags, sel then
    ; keeps the difference where it did not borrow and 0 elsewhere.
    usub8   r8, r5, r6          ; calculate difference
    pld     [r0, r1, lsl #1]    ; prefetch src_ptr + 2 * source_stride
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]    ; prefetch ref_ptr + 2 * recon_stride
    sel     r8, r9, lr          ; select bytes with negative difference
    orr     r8, r8, r7          ; differences of all 4 pixels

    ldr     r5, [r0, #0x4]      ; load 4 src pixels

    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    usub8   r8, r5, r6          ; calculate difference
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    sel     r8, r9, lr          ; select bytes with negative difference
    orr     r8, r8, r7          ; differences of all 4 pixels

    ldr     r5, [r0, #0x8]      ; load 4 src pixels

    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    usub8   r8, r5, r6          ; calculate difference
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    sel     r8, r9, lr          ; select bytes with negative difference
    orr     r8, r8, r7          ; differences of all 4 pixels

    ldr     r5, [r0, #0xc]      ; load 4 src pixels

    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    usub8   r8, r5, r6          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r8, lr          ; select bytes with positive difference
    usub8   r9, r6, r5          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set ref_ptr to next row
    sel     r8, r9, lr          ; select bytes with negative difference
    orr     r8, r8, r7          ; differences of all 4 pixels

    subs    r12, r12, #1        ; next row (flags are consumed by bne below)

    ; calculate sse
    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)

    bne     loop

    ; return stuff
    ldr     r1, [sp, #28]       ; get address of sse (above the 7 saved words)
    mov     r0, r4              ; return sse
    str     r4, [r1]            ; store sse

    pop     {r4-r9, pc}

    ENDP

    END