1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12    EXPORT  |vp8_mse16x16_neon|
13    EXPORT  |vp8_get4x4sse_cs_neon|
14
15    ARM
16    REQUIRE8
17    PRESERVE8
18
19    AREA ||.text||, CODE, READONLY, ALIGN=2
;============================
; vp8_mse16x16_neon
;
; Sum of squared differences between a 16x16 block of source pixels and a
; 16x16 block of reference pixels. Unlike vp8_variance(), the plain sum of
; differences is never needed here, so only the squared sum is computed.
;
; ABI:   AAPCS (ARM state)
; In:    r0      unsigned char *src_ptr
;        r1      int source_stride   (bytes between source rows)
;        r2      unsigned char *ref_ptr
;        r3      int recon_stride    (bytes between reference rows)
;        [sp]    unsigned int *sse   (fifth argument, passed on the stack)
; Out:   r0      sum of squared differences (low 32 bits)
;        *sse    same value as r0
; Clobb: r0, r2, r12, flags, q0-q3, q8-q15
;
; Only caller-saved NEON registers are used. q4-q7 (d8-d15) must be preserved
; across calls under AAPCS, so the accumulators live in q15 and q8-q10 and
; nothing has to be spilled to the stack.

|vp8_mse16x16_neon| PROC
    vmov.i8         q15, #0                     ;q15, q8, q9, q10 - partial sse sums
    vmov.i8         q8, #0
    vmov.i8         q9, #0
    vmov.i8         q10, #0

    mov             r12, #8                     ;8 iterations x 2 rows = 16 rows

mse16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;source row n
    vld1.8          {q2}, [r2], r3              ;reference row n
    vld1.8          {q1}, [r0], r1              ;source row n+1
    vld1.8          {q3}, [r2], r3              ;reference row n+1

    vsubl.u8        q11, d0, d4                 ;widen to 16 bit: src - ref, row n, pixels 0-7
    vsubl.u8        q12, d1, d5                 ;row n, pixels 8-15
    vsubl.u8        q13, d2, d6                 ;row n+1, pixels 0-7
    vsubl.u8        q14, d3, d7                 ;row n+1, pixels 8-15

    vmlal.s16       q15, d22, d22               ;accumulate diff * diff as 32-bit lanes
    vmlal.s16       q8, d23, d23

    subs            r12, r12, #1                ;the NEON ops below leave the flags alone

    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vmlal.s16       q15, d26, d26
    vmlal.s16       q8, d27, d27
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             mse16x16_neon_loop

    vadd.u32        q15, q15, q8                ;fold the four accumulators into one
    vadd.u32        q9, q9, q10

    ldr             r12, [sp]                   ;r12 = sse (sp is untouched by this routine)

    vadd.u32        q10, q15, q9
    vpaddl.u32      q1, q10                     ;pairwise add: 4 x u32 -> 2 x u64
    vadd.u64        d0, d2, d3                  ;d0 = total

    vst1.32         {d0[0]}, [r12]              ;*sse = total
    vmov.32         r0, d0[0]                   ;return total

    bx              lr

    ENDP
77
78
;=============================
; vp8_get4x4sse_cs_neon
;
; Sum of squared differences between a 4x4 block of source pixels and a
; 4x4 block of reference pixels.
;
; ABI:   AAPCS (ARM state)
; In:    r0      unsigned char *src_ptr
;        r1      int source_stride   (bytes between source rows)
;        r2      unsigned char *ref_ptr
;        r3      int recon_stride    (bytes between reference rows)
; Out:   r0      sum of squared differences (low 32 bits)
; Clobb: r0, r2, q0-q3, q8-q15
;
; NOTE: every row is fetched with an 8-byte load, but only the first four
; pixels of each row are squared and summed. The four bytes that follow each
; 4-pixel row are therefore read and ignored, so they must be readable.
;
; Only caller-saved NEON registers are used. q4-q7 (d8-d15) must be preserved
; across calls under AAPCS, so q15 is used in place of q7.
|vp8_get4x4sse_cs_neon| PROC
    vld1.8          {d0}, [r0], r1              ;source row 0
    vld1.8          {d4}, [r2], r3              ;reference row 0
    vld1.8          {d1}, [r0], r1              ;source row 1
    vld1.8          {d5}, [r2], r3              ;reference row 1
    vld1.8          {d2}, [r0], r1              ;source row 2
    vld1.8          {d6}, [r2], r3              ;reference row 2
    vld1.8          {d3}, [r0], r1              ;source row 3
    vld1.8          {d7}, [r2], r3              ;reference row 3

    vsubl.u8        q11, d0, d4                 ;widen to 16 bit: src - ref, one row each
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vmull.s16       q15, d22, d22               ;square only the low 4 lanes (pixels 0-3)
    vmull.s16       q8, d24, d24
    vmull.s16       q9, d26, d26
    vmull.s16       q10, d28, d28

    vadd.u32        q15, q15, q8                ;fold the four rows into one vector
    vadd.u32        q9, q9, q10
    vadd.u32        q9, q15, q9

    vpaddl.u32      q1, q9                      ;pairwise add: 4 x u32 -> 2 x u64
    vadd.u64        d0, d2, d3                  ;d0 = total

    vmov.32         r0, d0[0]                   ;return total
    bx              lr

    ENDP
115
116    END
117