;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_mse16x16_neon|
    EXPORT  |vp8_get4x4sse_cs_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
;============================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
;note: sum is never used in this function, so the sum calculation done in
;vp8_variance() is omitted here.

|vp8_mse16x16_neon| PROC
    vpush           {q7}

    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse accumulators
    vmov.i8         q8, #0
    vmov.i8         q9, #0
    vmov.i8         q10, #0

    mov             r12, #8                     ;loop counter: 2 rows per iteration x 8 = 16 rows

mse16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;differences widened to 16 bits
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vmlal.s16       q7, d22, d22                ;accumulate squared differences
    vmlal.s16       q8, d23, d23

    subs            r12, r12, #1

    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vmlal.s16       q7, d26, d26
    vmlal.s16       q8, d27, d27
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             mse16x16_neon_loop

    vadd.u32        q7, q7, q8                  ;reduce the four accumulators to one sum
    vadd.u32        q9, q9, q10

    ldr             r12, [sp, #16]              ;load *sse from stack

    vadd.u32        q10, q7, q9
    vpaddl.u32      q1, q10
    vadd.u64        d0, d2, d3

    vst1.32         {d0[0]}, [r12]              ;*sse = sum
    vmov.32         r0, d0[0]                   ;return sum in r0

    vpop            {q7}
    bx              lr

    ENDP


;=============================
; r0    unsigned char *src_ptr,
; r1    int source_stride,
; r2    unsigned char *ref_ptr,
; r3    int recon_stride
|vp8_get4x4sse_cs_neon| PROC
    vpush           {q7}

    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vmull.s16       q7, d22, d22                ;only the low 4 diffs of each row belong to the 4x4 block
    vmull.s16       q8, d24, d24
    vmull.s16       q9, d26, d26
    vmull.s16       q10, d28, d28

    vadd.u32        q7, q7, q8
    vadd.u32        q9, q9, q10
    vadd.u32        q9, q7, q9

    vpaddl.u32      q1, q9
    vadd.u64        d0, d2, d3

    vmov.32         r0, d0[0]                   ;return sum in r0

    vpop            {q7}
    bx              lr

    ENDP

    END
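
; For reference, the two routines above correspond to the C sketch below.
; This is an illustrative addition, not code from libvpx: the names
; mse16x16_ref and get4x4sse_cs_ref are hypothetical. It assumes only what
; the assembly shows: both routines return the sum of squared differences
; over the block, and vp8_mse16x16_neon also stores that sum through *sse.
;
;   unsigned int mse16x16_ref(const unsigned char *src_ptr, int source_stride,
;                             const unsigned char *ref_ptr, int recon_stride,
;                             unsigned int *sse)
;   {
;       unsigned int total = 0;
;       int i, j;
;
;       for (i = 0; i < 16; i++) {
;           for (j = 0; j < 16; j++) {
;               int diff = src_ptr[j] - ref_ptr[j];
;               total += (unsigned int)(diff * diff);
;           }
;           src_ptr += source_stride;
;           ref_ptr += recon_stride;
;       }
;
;       *sse = total;
;       return total;   /* no sum of differences is computed, unlike vp8_variance() */
;   }
;
;   unsigned int get4x4sse_cs_ref(const unsigned char *src_ptr, int source_stride,
;                                 const unsigned char *ref_ptr, int recon_stride)
;   {
;       unsigned int total = 0;
;       int i, j;
;
;       for (i = 0; i < 4; i++) {
;           for (j = 0; j < 4; j++) {
;               int diff = src_ptr[j] - ref_ptr[j];
;               total += (unsigned int)(diff * diff);
;           }
;           src_ptr += source_stride;
;           ref_ptr += recon_stride;
;       }
;
;       return total;
;   }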