/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

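// Reduce two uint16x8_t accumulators to a single scalar sum. Each input is
// widened to 32 bits (vaddl_u16) before the accumulators are combined, so
// the total may safely exceed the 16-bit range.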
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
                                        vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
                                        vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}
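
// Reduce a single uint16x8_t accumulator to a scalar sum, widening through
// 32 and 64 bits as the pairwise additions proceed.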
static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  const uint32x4_t a = vpaddlq_u16(vec_16x8);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

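// 64x64 SAD. Each uint16 lane of an accumulator absorbs four absolute
// differences (each <= 255) per row over 64 rows, i.e. at most
// 64 * 4 * 255 = 65280, which just fits in 16 bits. Adding the two
// accumulators together could overflow, so they are widened separately by
// horizontal_long_add_16x8.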
unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
                            vget_low_u8(vec_ref_32));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
                            vget_high_u8(vec_ref_32));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
                            vget_low_u8(vec_ref_48));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
                            vget_high_u8(vec_ref_48));
  }
  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
}

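// 32x32 SAD. A lane accumulates at most 32 * 2 * 255 = 16320, so the two
// accumulators can be added in 16 bits (vaddq_u16) before the horizontal
// reduction.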
unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
                            vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
                            vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
                            vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
                            vget_high_u8(vec_ref_16));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

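// 16x16 SAD; one 16-byte load per row from each buffer, split into low and
// high halves for the widening absolute-difference accumulate (vabal_u8).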
unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src),
                            vget_low_u8(vec_ref));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src),
                            vget_high_u8(vec_ref));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

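// 8x8 SAD; a single 8-byte load per row keeps the whole block in one 16-bit
// accumulator (maximum lane value 8 * 255 = 2040).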
unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum = vdupq_n_u16(0);

  for (i = 0; i < 8; ++i) {
    const uint8x8_t vec_src = vld1_u8(src);
    const uint8x8_t vec_ref = vld1_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
  }
  return horizontal_add_16x8(vec_accum);
}
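
#if 0
/* Minimal usage sketch, not part of the library and excluded from the build:
 * computes the SAD between two hypothetical contiguous 16x16 blocks. In
 * libvpx these functions are normally reached through the run-time CPU
 * dispatch declared in vp9_rtcd.h rather than called directly. */
static unsigned int example_sad16x16(const uint8_t src[16 * 16],
                                     const uint8_t ref[16 * 16]) {
  /* Stride equals block width because the example blocks are contiguous. */
  return vp9_sad16x16_neon(src, 16, ref, 16);
}
#endif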