1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
    EXPORT  |vp8_mse16x16_neon|                 ; make both routines visible to the linker
    EXPORT  |vp8_get4x4sse_cs_neon|

    ARM                                         ; assemble as ARM (not Thumb) code
    REQUIRE8                                    ; require 8-byte stack alignment on entry
    PRESERVE8                                   ; and preserve it for callees (AAPCS)

    AREA ||.text||, CODE, READONLY, ALIGN=2     ; code section, 4-byte aligned
20;============================
21; r0    unsigned char *src_ptr
22; r1    int source_stride
23; r2    unsigned char *ref_ptr
24; r3    int  recon_stride
25; stack unsigned int *sse
26;note: in this function, sum is never used. So, we can remove this part of calculation
27;from vp8_variance().
28
;-----------------------------------------------------------------------
; unsigned int vp8_mse16x16_neon(unsigned char *src_ptr, int source_stride,
;                                unsigned char *ref_ptr, int recon_stride,
;                                unsigned int *sse)
; In:    r0 = src_ptr, r1 = source_stride, r2 = ref_ptr, r3 = recon_stride,
;        [sp] = sse (fifth argument, on the stack at entry)
; Out:   r0 = sum of squared differences over the 16x16 block; the same
;        value is also stored to *sse
; Clobb: r12, q0-q3, q8-q14, flags (q7 saved/restored below)
; Note:  only the SSE is computed; the sum term of a full variance is
;        deliberately omitted (see comment above)
;-----------------------------------------------------------------------
|vp8_mse16x16_neon| PROC
    vpush           {q7}                        ; q7 (d14/d15) is callee-saved under AAPCS

    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
    vmov.i8         q8, #0                      ; four parallel u32 accumulators
    vmov.i8         q9, #0
    vmov.i8         q10, #0

    mov             r12, #8                     ; loop count: 8 passes x 2 rows = 16 rows

mse16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3              ; two full 16-pixel rows of each per pass
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ; widen to s16: diff = src - ref, row 0
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6                 ; row 1
    vsubl.u8        q14, d3, d7

    vmlal.s16       q7, d22, d22                ; accumulate diff^2 into 32-bit lanes
    vmlal.s16       q8, d23, d23

    subs            r12, r12, #1                ; decrement mid-stream to hide latency

    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vmlal.s16       q7, d26, d26
    vmlal.s16       q8, d27, d27
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             mse16x16_neon_loop

    vadd.u32        q7, q7, q8                  ; reduce the four accumulators pairwise
    vadd.u32        q9, q9, q10

    ldr             r12, [sp, #16]              ;load *sse from stack (+16 skips the vpush {q7} frame)

    vadd.u32        q10, q7, q9
    vpaddl.u32      q1, q10                     ; pairwise-add u32 lanes down to two u64
    vadd.u64        d0, d2, d3                  ; final scalar total in d0[0]

    vst1.32         {d0[0]}, [r12]              ; *sse = total
    vmov.32         r0, d0[0]                   ; also return the total in r0

    vpop            {q7}                        ; restore callee-saved q7
    bx              lr

    ENDP
80
81
82;=============================
83; r0    unsigned char *src_ptr,
84; r1    int  source_stride,
85; r2    unsigned char *ref_ptr,
86; r3    int  recon_stride
;-----------------------------------------------------------------------
; unsigned int vp8_get4x4sse_cs_neon(unsigned char *src_ptr, int source_stride,
;                                    unsigned char *ref_ptr, int recon_stride)
; In:    r0 = src_ptr, r1 = source_stride, r2 = ref_ptr, r3 = recon_stride
; Out:   r0 = sum of squared differences over the 4x4 block
; Clobb: q0-q3, q8-q14, flags (q7 saved/restored below)
; Note:  each vld1.8 reads a full 8 bytes per row, but only the first 4
;        pixels (the low halves d22/d24/d26/d28 of the widened diffs)
;        enter the sum. NOTE(review): this over-reads 4 bytes per row -
;        presumed safe for the padded buffers this is called on; confirm
;        against callers.
;-----------------------------------------------------------------------
|vp8_get4x4sse_cs_neon| PROC
    vpush           {q7}                        ; q7 (d14/d15) is callee-saved under AAPCS

    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3              ; one row of each per load pair
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ; widen to s16: diff = src - ref, per row
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vmull.s16       q7, d22, d22                ; square only the low 4 diffs of each row
    vmull.s16       q8, d24, d24
    vmull.s16       q9, d26, d26
    vmull.s16       q10, d28, d28

    vadd.u32        q7, q7, q8                  ; reduce the four per-row results
    vadd.u32        q9, q9, q10
    vadd.u32        q9, q7, q9

    vpaddl.u32      q1, q9                      ; pairwise-add u32 lanes down to two u64
    vadd.u64        d0, d2, d3                  ; final scalar total in d0[0]

    vmov.32         r0, d0[0]                   ; return the total in r0

    vpop            {q7}                        ; restore callee-saved q7
    bx              lr

    ENDP
122
123    END
124