/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

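/* Sum of absolute differences (SAD) of an 8x8 block: each row is loaded
 * as an 8-byte vector, the absolute differences against the reference
 * row are widened to 16-bit lanes and accumulated, and the eight lanes
 * are reduced to a single 32-bit sum at the end.
 */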
unsigned int vp8_sad8x8_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    /* First row: widening absolute difference of 8 source/reference bytes. */
    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    /* Accumulate the remaining 7 rows. */
    for (i = 0; i < 7; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    /* Horizontal reduction: pairwise widen 16 -> 32 -> 64 bits, then add
     * the low halves of the two 64-bit lanes; the total fits in 32 bits. */
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

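/* 8x16 SAD: the same per-row accumulation as the 8x8 case, run over 16
 * rows. The worst-case lane total (16 * 255 = 4080) still fits in 16 bits.
 */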
unsigned int vp8_sad8x16_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    for (i = 0; i < 15; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

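/* 4x4 SAD. Note that vld1_u8 loads a full 8 bytes per row, so each load
 * reads 4 bytes past the end of the block; only the low four 16-bit
 * lanes of the accumulator enter the final reduction, so those extra
 * bytes never affect the result.
 */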
unsigned int vp8_sad4x4_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x8_t d0, d8;
    uint16x8_t q12;
    uint32x2_t d1;
    uint64x1_t d3;
    int i;

    d0 = vld1_u8(src_ptr);
    src_ptr += src_stride;
    d8 = vld1_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(d0, d8);

    for (i = 0; i < 3; i++) {
        d0 = vld1_u8(src_ptr);
        src_ptr += src_stride;
        d8 = vld1_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, d0, d8);
    }

    /* Only the low four lanes correspond to the 4x4 block; reduce those. */
    d1 = vpaddl_u16(vget_low_u16(q12));
    d3 = vpaddl_u32(d1);

    return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
}

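/* 16x16 SAD: each 16-byte row is split into low and high halves that are
 * accumulated separately in q12 and q13, then merged before the final
 * horizontal reduction. Per-lane totals peak at 16 * 255 = 4080 per
 * accumulator, so the merged 16-bit accumulator cannot overflow.
 */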
unsigned int vp8_sad16x16_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x16_t q0, q4;
    uint16x8_t q12, q13;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    /* First row: separate widening accumulators for the two 8-byte halves. */
    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

    /* Accumulate the remaining 15 rows. */
    for (i = 0; i < 15; i++) {
        q0 = vld1q_u8(src_ptr);
        src_ptr += src_stride;
        q4 = vld1q_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
    }

    /* Merge the two halves, then reduce horizontally as in the 8x8 case. */
    q12 = vaddq_u16(q12, q13);
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}

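/* 16x8 SAD: identical to the 16x16 case, but over 8 rows. */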
unsigned int vp8_sad16x8_neon(
        unsigned char *src_ptr,
        int src_stride,
        unsigned char *ref_ptr,
        int ref_stride) {
    uint8x16_t q0, q4;
    uint16x8_t q12, q13;
    uint32x4_t q1;
    uint64x2_t q3;
    uint32x2_t d5;
    int i;

    q0 = vld1q_u8(src_ptr);
    src_ptr += src_stride;
    q4 = vld1q_u8(ref_ptr);
    ref_ptr += ref_stride;
    q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
    q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));

    for (i = 0; i < 7; i++) {
        q0 = vld1q_u8(src_ptr);
        src_ptr += src_stride;
        q4 = vld1q_u8(ref_ptr);
        ref_ptr += ref_stride;
        q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
        q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
    }

    q12 = vaddq_u16(q12, q13);
    q1 = vpaddlq_u16(q12);
    q3 = vpaddlq_u32(q1);
    d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
                  vreinterpret_u32_u64(vget_high_u64(q3)));

    return vget_lane_u32(d5, 0);
}