1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "./vpx_config.h"
14 #include "./vp8_rtcd.h"
15
/*
 * Simple loop filter applied to 16 adjacent pixels across one edge.
 *
 * s      - address of the first row on the far side of the edge. Sixteen
 *          bytes are read from each of the rows s - 2 * p, s - p, s and
 *          s + p; the rows at s - p and s are overwritten with the
 *          filtered result.
 * p      - distance in bytes between consecutive rows.
 * blimit - only *blimit is read; it is broadcast to every lane and used as
 *          the edge threshold.
 *
 * In the comments below p1, p0, q0, q1 denote the four rows in memory order.
 */
static INLINE void vp8_loop_filter_simple_horizontal_edge_neon(
    unsigned char *s, int p, const unsigned char *blimit) {
  const uint8x16_t threshold = vdupq_n_u8(*blimit);
  const uint8x16_t sign_flip = vdupq_n_u8(0x80);
  const int16x8_t three_s16 = vdupq_n_s16(3);
  const int8x16_t round_p0 = vreinterpretq_s8_u8(vdupq_n_u8(3));
  const int8x16_t round_q0 = vreinterpretq_s8_u8(vdupq_n_u8(4));

  /* Load the four rows straddling the edge. */
  uint8_t *row = s - (p << 1);
  const uint8x16_t p1_u8 = vld1q_u8(row);
  row += p;
  const uint8x16_t p0_u8 = vld1q_u8(row);
  row += p;
  const uint8x16_t q0_u8 = vld1q_u8(row);
  row += p;
  const uint8x16_t q1_u8 = vld1q_u8(row);

  /*
   * Lane mask: all ones where 2 * |p0 - q0| + |p1 - q1| / 2 <= blimit
   * (additions saturate at 255), all zeros elsewhere.
   */
  const uint8x16_t abs_p0_q0 = vabdq_u8(p0_u8, q0_u8);
  const uint8x16_t abs_p1_q1 = vabdq_u8(p1_u8, q1_u8);
  const uint8x16_t edge_delta =
      vqaddq_u8(vqaddq_u8(abs_p0_q0, abs_p0_q0), vshrq_n_u8(abs_p1_q1, 1));
  const uint8x16_t mask = vcgeq_u8(threshold, edge_delta);

  /* Flip the top bit so the pixels can be handled as signed bytes. */
  const int8x16_t p1 = vreinterpretq_s8_u8(veorq_u8(p1_u8, sign_flip));
  const int8x16_t p0 = vreinterpretq_s8_u8(veorq_u8(p0_u8, sign_flip));
  const int8x16_t q0 = vreinterpretq_s8_u8(veorq_u8(q0_u8, sign_flip));
  const int8x16_t q1 = vreinterpretq_s8_u8(veorq_u8(q1_u8, sign_flip));

  /*
   * filter = sat8(3 * (q0 - p0) + sat8(p1 - q1)), evaluated in 16-bit lanes
   * (low and high halves separately) and narrowed with saturation.
   */
  const int8x16_t p1_minus_q1 = vqsubq_s8(p1, q1);
  int16x8_t wide_lo = vsubl_s8(vget_low_s8(q0), vget_low_s8(p0));
  int16x8_t wide_hi = vsubl_s8(vget_high_s8(q0), vget_high_s8(p0));
  wide_lo = vmulq_s16(wide_lo, three_s16);
  wide_hi = vmulq_s16(wide_hi, three_s16);
  wide_lo = vaddw_s8(wide_lo, vget_low_s8(p1_minus_q1));
  wide_hi = vaddw_s8(wide_hi, vget_high_s8(p1_minus_q1));

  /* Zero the filter value in lanes that failed the threshold test. */
  const int8x16_t filter =
      vandq_s8(vcombine_s8(vqmovn_s16(wide_lo), vqmovn_s16(wide_hi)),
               vreinterpretq_s8_u8(mask));

  /* The two sides of the edge use different rounding offsets (3 vs. 4). */
  const int8x16_t p0_adjust = vshrq_n_s8(vqaddq_s8(filter, round_p0), 3);
  const int8x16_t q0_adjust = vshrq_n_s8(vqaddq_s8(filter, round_q0), 3);

  /* Apply the adjustments with saturation and return to unsigned pixels. */
  const uint8x16_t p0_out =
      veorq_u8(vreinterpretq_u8_s8(vqaddq_s8(p0, p0_adjust)), sign_flip);
  const uint8x16_t q0_out =
      veorq_u8(vreinterpretq_u8_s8(vqsubq_s8(q0, q0_adjust)), sign_flip);

  vst1q_u8(s, q0_out);
  vst1q_u8(s - p, p0_out);
}
90
/*
 * Runs the simple horizontal-edge filter on three edges, located 4, 8 and
 * 12 rows (of y_stride bytes each) past y_ptr.
 *
 * y_ptr    - base address; it is advanced by 4 * y_stride before each call.
 * y_stride - distance in bytes between consecutive rows.
 * blimit   - threshold pointer, forwarded unchanged to the edge filter.
 */
void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, int y_stride,
                              const unsigned char *blimit) {
  const int edge_step = y_stride * 4;
  int edge;

  for (edge = 0; edge < 3; ++edge) {
    y_ptr += edge_step;
    vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
  }
}
101
/*
 * Runs the simple horizontal-edge filter once, on the edge at y_ptr itself.
 *
 * y_ptr    - address handed to the edge filter as its edge row.
 * y_stride - distance in bytes between consecutive rows.
 * blimit   - threshold pointer, forwarded unchanged to the edge filter.
 */
void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, int y_stride,
                               const unsigned char *blimit) {
  vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, blimit);
}
107