1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <stdlib.h>
12
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/vpx_dsp_common.h"
16 #include "vpx_ports/mem.h"
17
signed_char_clamp(int t)18 static INLINE int8_t signed_char_clamp(int t) {
19 return (int8_t)clamp(t, -128, 127);
20 }
21
22 #if CONFIG_VP9_HIGHBITDEPTH
signed_char_clamp_high(int t,int bd)23 static INLINE int16_t signed_char_clamp_high(int t, int bd) {
24 switch (bd) {
25 case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
26 case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
27 case 8:
28 default: return (int16_t)clamp(t, -128, 128 - 1);
29 }
30 }
31 #endif
32
33 // Should we apply any filter at all: 11111111 yes, 00000000 no
// Returns -1 (all bits set) only when every adjacent-sample difference on each
// side of the edge is at most limit and the cross-edge measure
// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 is at most blimit; returns 0 otherwise.
static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                                 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                                 uint8_t q1, uint8_t q2, uint8_t q3) {
  const int over_limit = abs(p3 - p2) > limit || abs(p2 - p1) > limit ||
                         abs(p1 - p0) > limit || abs(q1 - q0) > limit ||
                         abs(q2 - q1) > limit || abs(q3 - q2) > limit;
  const int over_blimit = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;

  return (over_limit || over_blimit) ? 0 : -1;
}
47
// Flatness test over four samples on each side of the edge. Returns -1 (all
// bits set) when p1, p2 and p3 are each within thresh of p0 and q1, q2 and q3
// are each within thresh of q0; returns 0 otherwise.
static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
                                uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                                uint8_t q2, uint8_t q3) {
  const int p_side_rough = abs(p1 - p0) > thresh || abs(p2 - p0) > thresh ||
                           abs(p3 - p0) > thresh;
  const int q_side_rough = abs(q1 - q0) > thresh || abs(q2 - q0) > thresh ||
                           abs(q3 - q0) > thresh;

  return (p_side_rough || q_side_rough) ? 0 : -1;
}
60
flat_mask5(uint8_t thresh,uint8_t p4,uint8_t p3,uint8_t p2,uint8_t p1,uint8_t p0,uint8_t q0,uint8_t q1,uint8_t q2,uint8_t q3,uint8_t q4)61 static INLINE int8_t flat_mask5(uint8_t thresh, uint8_t p4, uint8_t p3,
62 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
63 uint8_t q1, uint8_t q2, uint8_t q3,
64 uint8_t q4) {
65 int8_t mask = ~flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3);
66 mask |= (abs(p4 - p0) > thresh) * -1;
67 mask |= (abs(q4 - q0) > thresh) * -1;
68 return ~mask;
69 }
70
71 // Is there high edge variance internal edge: 11111111 yes, 00000000 no
// Returns -1 (all bits set) when abs(p1 - p0) or abs(q1 - q0) exceeds thresh,
// and 0 when both differences are at most thresh.
static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1) {
  const int p_step = abs(p1 - p0);
  const int q_step = abs(q1 - q0);

  return (p_step > thresh || q_step > thresh) ? -1 : 0;
}
79
filter4(int8_t mask,uint8_t thresh,uint8_t * op1,uint8_t * op0,uint8_t * oq0,uint8_t * oq1)80 static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
81 uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
82 int8_t filter1, filter2;
83
84 const int8_t ps1 = (int8_t)*op1 ^ 0x80;
85 const int8_t ps0 = (int8_t)*op0 ^ 0x80;
86 const int8_t qs0 = (int8_t)*oq0 ^ 0x80;
87 const int8_t qs1 = (int8_t)*oq1 ^ 0x80;
88 const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
89
90 // add outer taps if we have high edge variance
91 int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
92
93 // inner taps
94 filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
95
96 // save bottom 3 bits so that we round one side +4 and the other +3
97 // if it equals 4 we'll set it to adjust by -1 to account for the fact
98 // we'd round it by 3 the other way
99 filter1 = signed_char_clamp(filter + 4) >> 3;
100 filter2 = signed_char_clamp(filter + 3) >> 3;
101
102 *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
103 *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
104
105 // outer tap adjustments
106 filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
107
108 *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
109 *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
110 }
111
// Applies filter4() across a horizontal edge for 8 adjacent columns.
//   s      - q0 sample of the first column; the p samples sit in the rows
//            above it (s - pitch is p0) and q1..q3 in the rows below.
//   pitch  - offset, in samples, between vertically adjacent rows.
//   blimit, limit, thresh - pointers to the thresholds; only the first byte
//            of each is read.
void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
  int col;

  for (col = 0; col < 8; ++col, ++s) {
    uint8_t *const p1 = s - 2 * pitch;
    uint8_t *const p0 = s - pitch;
    uint8_t *const q0 = s;
    uint8_t *const q1 = s + pitch;
    const int8_t mask =
        filter_mask(*limit, *blimit, s[-4 * pitch], s[-3 * pitch], *p1, *p0,
                    *q0, *q1, s[2 * pitch], s[3 * pitch]);

    filter4(mask, *thresh, p1, p0, q0, q1);
  }
}
129
// Runs vpx_lpf_horizontal_4_c() twice: on the 8 columns starting at s with
// (blimit0, limit0, thresh0), then on the 8 columns starting at s + 8 with
// (blimit1, limit1, thresh1).
void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *const second_half = s + 8;

  vpx_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_4_c(second_half, pitch, blimit1, limit1, thresh1);
}
137
// Applies filter4() across a vertical edge for 8 consecutive rows.
//   s      - q0 sample of the first row; p3..p0 are s[-4]..s[-1] and q1..q3
//            are s[1]..s[3].
//   pitch  - offset, in samples, from one row to the next.
//   blimit, limit, thresh - pointers to the thresholds; only the first byte
//            of each is read.
void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  int row;

  for (row = 0; row < 8; ++row, s += pitch) {
    const int8_t mask = filter_mask(*limit, *blimit, s[-4], s[-3], s[-2],
                                    s[-1], s[0], s[1], s[2], s[3]);

    filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
  }
}
153
// Runs vpx_lpf_vertical_4_c() twice: on the 8 rows starting at s with
// (blimit0, limit0, thresh0), then on the 8 rows starting at s + 8 * pitch
// with (blimit1, limit1, thresh1).
void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  uint8_t *const lower_half = s + 8 * pitch;

  vpx_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
  vpx_lpf_vertical_4_c(lower_half, pitch, blimit1, limit1, thresh1);
}
161
filter8(int8_t mask,uint8_t thresh,uint8_t flat,uint8_t * op3,uint8_t * op2,uint8_t * op1,uint8_t * op0,uint8_t * oq0,uint8_t * oq1,uint8_t * oq2,uint8_t * oq3)162 static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
163 uint8_t *op3, uint8_t *op2, uint8_t *op1,
164 uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
165 uint8_t *oq2, uint8_t *oq3) {
166 if (flat && mask) {
167 const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
168 const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
169
170 // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
171 *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
172 *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
173 *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
174 *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
175 *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
176 *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
177 } else {
178 filter4(mask, thresh, op1, op0, oq0, oq1);
179 }
180 }
181
// Applies filter8() across a horizontal edge for 8 adjacent columns.
//   s      - q0 sample of the first column; rows above hold p0..p3 and rows
//            below hold q1..q3.
//   pitch  - offset, in samples, between vertically adjacent rows.
//   blimit, limit, thresh - pointers to the thresholds; only the first byte
//            of each is read.
// The flatness test always uses a threshold of 1.
void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
  int col;

  for (col = 0; col < 8; ++col, ++s) {
    uint8_t *const p3 = s - 4 * pitch;
    uint8_t *const p2 = s - 3 * pitch;
    uint8_t *const p1 = s - 2 * pitch;
    uint8_t *const p0 = s - pitch;
    uint8_t *const q0 = s;
    uint8_t *const q1 = s + pitch;
    uint8_t *const q2 = s + 2 * pitch;
    uint8_t *const q3 = s + 3 * pitch;
    const int8_t mask = filter_mask(*limit, *blimit, *p3, *p2, *p1, *p0, *q0,
                                    *q1, *q2, *q3);
    const int8_t flat = flat_mask4(1, *p3, *p2, *p1, *p0, *q0, *q1, *q2, *q3);

    filter8(mask, *thresh, flat, p3, p2, p1, p0, q0, q1, q2, q3);
  }
}
202
// Runs vpx_lpf_horizontal_8_c() twice: on the 8 columns starting at s with
// (blimit0, limit0, thresh0), then on the 8 columns starting at s + 8 with
// (blimit1, limit1, thresh1).
void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                                 const uint8_t *limit0, const uint8_t *thresh0,
                                 const uint8_t *blimit1, const uint8_t *limit1,
                                 const uint8_t *thresh1) {
  uint8_t *const second_half = s + 8;

  vpx_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_c(second_half, pitch, blimit1, limit1, thresh1);
}
210
// Applies filter8() across a vertical edge for 8 consecutive rows.
//   s      - q0 sample of the first row; p3..p0 are s[-4]..s[-1] and q1..q3
//            are s[1]..s[3].
//   pitch  - offset, in samples, from one row to the next.
//   blimit, limit, thresh - pointers to the thresholds; only the first byte
//            of each is read.
// The flatness test always uses a threshold of 1.
void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  int row;

  for (row = 0; row < 8; ++row, s += pitch) {
    const uint8_t p3 = s[-4];
    const uint8_t p2 = s[-3];
    const uint8_t p1 = s[-2];
    const uint8_t p0 = s[-1];
    const uint8_t q0 = s[0];
    const uint8_t q1 = s[1];
    const uint8_t q2 = s[2];
    const uint8_t q3 = s[3];
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);

    filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
            s + 3);
  }
}
226
// Runs vpx_lpf_vertical_8_c() twice: on the 8 rows starting at s with
// (blimit0, limit0, thresh0), then on the 8 rows starting at s + 8 * pitch
// with (blimit1, limit1, thresh1).
void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1) {
  uint8_t *const lower_half = s + 8 * pitch;

  vpx_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_c(lower_half, pitch, blimit1, limit1, thresh1);
}
234
filter16(int8_t mask,uint8_t thresh,uint8_t flat,uint8_t flat2,uint8_t * op7,uint8_t * op6,uint8_t * op5,uint8_t * op4,uint8_t * op3,uint8_t * op2,uint8_t * op1,uint8_t * op0,uint8_t * oq0,uint8_t * oq1,uint8_t * oq2,uint8_t * oq3,uint8_t * oq4,uint8_t * oq5,uint8_t * oq6,uint8_t * oq7)235 static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
236 uint8_t flat2, uint8_t *op7, uint8_t *op6,
237 uint8_t *op5, uint8_t *op4, uint8_t *op3,
238 uint8_t *op2, uint8_t *op1, uint8_t *op0,
239 uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
240 uint8_t *oq3, uint8_t *oq4, uint8_t *oq5,
241 uint8_t *oq6, uint8_t *oq7) {
242 if (flat2 && flat && mask) {
243 const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3,
244 p2 = *op2, p1 = *op1, p0 = *op0;
245
246 const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
247 q5 = *oq5, q6 = *oq6, q7 = *oq7;
248
249 // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
250 *op6 = ROUND_POWER_OF_TWO(
251 p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
252 *op5 = ROUND_POWER_OF_TWO(
253 p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
254 *op4 = ROUND_POWER_OF_TWO(
255 p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
256 *op3 = ROUND_POWER_OF_TWO(
257 p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
258 *op2 = ROUND_POWER_OF_TWO(
259 p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
260 4);
261 *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
262 q0 + q1 + q2 + q3 + q4 + q5,
263 4);
264 *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
265 q1 + q2 + q3 + q4 + q5 + q6,
266 4);
267 *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
268 q2 + q3 + q4 + q5 + q6 + q7,
269 4);
270 *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
271 q3 + q4 + q5 + q6 + q7 * 2,
272 4);
273 *oq2 = ROUND_POWER_OF_TWO(
274 p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
275 4);
276 *oq3 = ROUND_POWER_OF_TWO(
277 p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
278 *oq4 = ROUND_POWER_OF_TWO(
279 p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
280 *oq5 = ROUND_POWER_OF_TWO(
281 p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
282 *oq6 = ROUND_POWER_OF_TWO(
283 p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
284 } else {
285 filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
286 }
287 }
288
// Applies filter16() across a horizontal edge for 8 * count adjacent columns.
//   s      - q0 sample of the first column; rows s - pitch .. s - 8 * pitch
//            hold p0..p7 and rows s + pitch .. s + 7 * pitch hold q1..q7.
//   pitch  - offset, in samples, between vertically adjacent rows.
//   blimit, limit, thresh - pointers to the thresholds; only the first byte
//            of each is read.
//   count  - number of 8-column groups to process.
// Both flatness tests use a threshold of 1. The wide test (flat2) compares
// p4..p7 against p0 and q4..q7 against q0.
static void mb_lpf_horizontal_edge_w(uint8_t *s, int pitch,
                                     const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int count) {
  int col;

  for (col = 0; col < 8 * count; ++col, ++s) {
    uint8_t *const p7 = s - 8 * pitch;
    uint8_t *const p6 = s - 7 * pitch;
    uint8_t *const p5 = s - 6 * pitch;
    uint8_t *const p4 = s - 5 * pitch;
    uint8_t *const p3 = s - 4 * pitch;
    uint8_t *const p2 = s - 3 * pitch;
    uint8_t *const p1 = s - 2 * pitch;
    uint8_t *const p0 = s - pitch;
    uint8_t *const q0 = s;
    uint8_t *const q1 = s + pitch;
    uint8_t *const q2 = s + 2 * pitch;
    uint8_t *const q3 = s + 3 * pitch;
    uint8_t *const q4 = s + 4 * pitch;
    uint8_t *const q5 = s + 5 * pitch;
    uint8_t *const q6 = s + 6 * pitch;
    uint8_t *const q7 = s + 7 * pitch;
    const int8_t mask = filter_mask(*limit, *blimit, *p3, *p2, *p1, *p0, *q0,
                                    *q1, *q2, *q3);
    const int8_t flat = flat_mask4(1, *p3, *p2, *p1, *p0, *q0, *q1, *q2, *q3);
    const int8_t flat2 =
        flat_mask5(1, *p7, *p6, *p5, *p4, *p0, *q0, *q4, *q5, *q6, *q7);

    filter16(mask, *thresh, flat, flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0,
             q1, q2, q3, q4, q5, q6, q7);
  }
}
317
// Wide filter on a horizontal edge for a single 8-column group; forwards to
// mb_lpf_horizontal_edge_w() with a group count of 1.
void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  const int groups = 1;

  mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, groups);
}
322
// Wide filter on a horizontal edge for two 8-column groups (16 columns) that
// share one set of thresholds; forwards to mb_lpf_horizontal_edge_w() with a
// group count of 2.
void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh) {
  const int groups = 2;

  mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, groups);
}
327
// Applies filter16() across a vertical edge for count consecutive rows.
//   s      - q0 sample of the first row; p7..p0 are s[-8]..s[-1] and q1..q7
//            are s[1]..s[7].
//   pitch  - offset, in samples, from one row to the next.
//   blimit, limit, thresh - pointers to the thresholds; only the first byte
//            of each is read.
//   count  - number of rows to process.
// Both flatness tests use a threshold of 1. The wide test (flat2) compares
// p4..p7 against p0 and q4..q7 against q0.
static void mb_lpf_vertical_edge_w(uint8_t *s, int pitch, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int count) {
  int row;

  for (row = 0; row < count; ++row, s += pitch) {
    const uint8_t p3 = s[-4];
    const uint8_t p2 = s[-3];
    const uint8_t p1 = s[-2];
    const uint8_t p0 = s[-1];
    const uint8_t q0 = s[0];
    const uint8_t q1 = s[1];
    const uint8_t q2 = s[2];
    const uint8_t q3 = s[3];
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
    const int8_t flat2 = flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0,
                                    s[4], s[5], s[6], s[7]);

    filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4,
             s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6,
             s + 7);
  }
}
348
// Wide filter on a vertical edge for 8 rows; forwards to
// mb_lpf_vertical_edge_w() with a row count of 8.
void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
  const int rows = 8;

  mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, rows);
}
353
// Wide filter on a vertical edge for 16 rows that share one set of
// thresholds; forwards to mb_lpf_vertical_edge_w() with a row count of 16.
void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit,
                                const uint8_t *limit, const uint8_t *thresh) {
  const int rows = 16;

  mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, rows);
}
358
359 #if CONFIG_VP9_HIGHBITDEPTH
360 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
// limit and blimit are first scaled by 2^(bd - 8). The result is -1 (all bits
// set) only when every adjacent-sample difference on each side of the edge is
// at most the scaled limit and abs(p0 - q0) * 2 + abs(p1 - q1) / 2 is at most
// the scaled blimit; otherwise it is 0.
static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
                                        uint16_t p3, uint16_t p2, uint16_t p1,
                                        uint16_t p0, uint16_t q0, uint16_t q1,
                                        uint16_t q2, uint16_t q3, int bd) {
  const int16_t limit16 = (uint16_t)limit << (bd - 8);
  const int16_t blimit16 = (uint16_t)blimit << (bd - 8);
  const int over_limit = abs(p3 - p2) > limit16 || abs(p2 - p1) > limit16 ||
                         abs(p1 - p0) > limit16 || abs(q1 - q0) > limit16 ||
                         abs(q2 - q1) > limit16 || abs(q3 - q2) > limit16;
  const int over_blimit = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16;

  return (over_limit || over_blimit) ? 0 : -1;
}
377
// Flatness test over four 16-bit samples on each side of the edge. thresh is
// scaled by 2^(bd - 8); the result is -1 (all bits set) when p1, p2 and p3
// are each within the scaled threshold of p0 and q1, q2 and q3 are each
// within it of q0, and 0 otherwise.
static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
                                       uint16_t p1, uint16_t p0, uint16_t q0,
                                       uint16_t q1, uint16_t q2, uint16_t q3,
                                       int bd) {
  const int16_t thresh16 = (uint16_t)thresh << (bd - 8);
  const int p_side_rough = abs(p1 - p0) > thresh16 ||
                           abs(p2 - p0) > thresh16 || abs(p3 - p0) > thresh16;
  const int q_side_rough = abs(q1 - q0) > thresh16 ||
                           abs(q2 - q0) > thresh16 || abs(q3 - q0) > thresh16;

  return (p_side_rough || q_side_rough) ? 0 : -1;
}
392
highbd_flat_mask5(uint8_t thresh,uint16_t p4,uint16_t p3,uint16_t p2,uint16_t p1,uint16_t p0,uint16_t q0,uint16_t q1,uint16_t q2,uint16_t q3,uint16_t q4,int bd)393 static INLINE int8_t highbd_flat_mask5(uint8_t thresh, uint16_t p4, uint16_t p3,
394 uint16_t p2, uint16_t p1, uint16_t p0,
395 uint16_t q0, uint16_t q1, uint16_t q2,
396 uint16_t q3, uint16_t q4, int bd) {
397 int8_t mask = ~highbd_flat_mask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
398 int16_t thresh16 = (uint16_t)thresh << (bd - 8);
399 mask |= (abs(p4 - p0) > thresh16) * -1;
400 mask |= (abs(q4 - q0) > thresh16) * -1;
401 return ~mask;
402 }
403
404 // Is there high edge variance internal edge:
405 // 11111111_11111111 yes, 00000000_00000000 no ?
// thresh is scaled by 2^(bd - 8). Returns -1 (all 16 bits set) when
// abs(p1 - p0) or abs(q1 - q0) exceeds the scaled threshold, and 0 when both
// differences are at most that threshold.
static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
                                      uint16_t q0, uint16_t q1, int bd) {
  const int16_t thresh16 = (uint16_t)thresh << (bd - 8);
  const int p_step = abs(p1 - p0);
  const int q_step = abs(q1 - q0);

  return (p_step > thresh16 || q_step > thresh16) ? -1 : 0;
}
414
highbd_filter4(int8_t mask,uint8_t thresh,uint16_t * op1,uint16_t * op0,uint16_t * oq0,uint16_t * oq1,int bd)415 static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
416 uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
417 int bd) {
418 int16_t filter1, filter2;
419 // ^0x80 equivalent to subtracting 0x80 from the values to turn them
420 // into -128 to +127 instead of 0 to 255.
421 int shift = bd - 8;
422 const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
423 const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
424 const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
425 const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
426 const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
427
428 // Add outer taps if we have high edge variance.
429 int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
430
431 // Inner taps.
432 filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
433
434 // Save bottom 3 bits so that we round one side +4 and the other +3
435 // if it equals 4 we'll set it to adjust by -1 to account for the fact
436 // we'd round it by 3 the other way.
437 filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
438 filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
439
440 *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
441 *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
442
443 // Outer tap adjustments.
444 filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
445
446 *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
447 *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
448 }
449
// Applies highbd_filter4() across a horizontal edge for 8 adjacent columns of
// 16-bit samples.
//   s      - q0 sample of the first column; rows above hold p0..p3 and rows
//            below hold q1..q3.
//   pitch  - offset, in samples, between vertically adjacent rows.
//   blimit, limit, thresh - pointers to the thresholds; only the first byte
//            of each is read.
//   bd     - bit depth, forwarded to the mask and filter helpers.
void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh, int bd) {
  int col;

  for (col = 0; col < 8; ++col, ++s) {
    uint16_t *const p1 = s - 2 * pitch;
    uint16_t *const p0 = s - pitch;
    uint16_t *const q0 = s;
    uint16_t *const q1 = s + pitch;
    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, s[-4 * pitch], s[-3 * pitch], *p1,
                           *p0, *q0, *q1, s[2 * pitch], s[3 * pitch], bd);

    highbd_filter4(mask, *thresh, p1, p0, q0, q1, bd);
  }
}
473
// Runs vpx_highbd_lpf_horizontal_4_c() twice: on the 8 columns starting at s
// with (blimit0, limit0, thresh0), then on the 8 columns starting at s + 8
// with (blimit1, limit1, thresh1). Both calls use the same bd.
void vpx_highbd_lpf_horizontal_4_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second_half = s + 8;

  vpx_highbd_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_4_c(second_half, pitch, blimit1, limit1, thresh1,
                                bd);
}
481
// Applies highbd_filter4() across a vertical edge for 8 consecutive rows of
// 16-bit samples.
//   s      - q0 sample of the first row; p3..p0 are s[-4]..s[-1] and q1..q3
//            are s[1]..s[3].
//   pitch  - offset, in samples, from one row to the next.
//   blimit, limit, thresh - pointers to the thresholds; only the first byte
//            of each is read.
//   bd     - bit depth, forwarded to the mask and filter helpers.
void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  int row;

  for (row = 0; row < 8; ++row, s += pitch) {
    const int8_t mask = highbd_filter_mask(*limit, *blimit, s[-4], s[-3],
                                           s[-2], s[-1], s[0], s[1], s[2],
                                           s[3], bd);

    highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
  }
}
498
// Runs vpx_highbd_lpf_vertical_4_c() twice: on the 8 rows starting at s with
// (blimit0, limit0, thresh0), then on the 8 rows starting at s + 8 * pitch
// with (blimit1, limit1, thresh1). Both calls use the same bd.
void vpx_highbd_lpf_vertical_4_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const lower_half = s + 8 * pitch;

  vpx_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_4_c(lower_half, pitch, blimit1, limit1, thresh1, bd);
}
507
highbd_filter8(int8_t mask,uint8_t thresh,uint8_t flat,uint16_t * op3,uint16_t * op2,uint16_t * op1,uint16_t * op0,uint16_t * oq0,uint16_t * oq1,uint16_t * oq2,uint16_t * oq3,int bd)508 static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
509 uint16_t *op3, uint16_t *op2, uint16_t *op1,
510 uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
511 uint16_t *oq2, uint16_t *oq3, int bd) {
512 if (flat && mask) {
513 const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
514 const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
515
516 // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
517 *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
518 *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
519 *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
520 *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
521 *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
522 *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
523 } else {
524 highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
525 }
526 }
527
// Applies highbd_filter8() across a horizontal edge for 8 adjacent columns of
// 16-bit samples.
//   s      - q0 sample of the first column; rows above hold p0..p3 and rows
//            below hold q1..q3.
//   pitch  - offset, in samples, between vertically adjacent rows.
//   blimit, limit, thresh - pointers to the thresholds; only the first byte
//            of each is read.
//   bd     - bit depth, forwarded to the mask and filter helpers.
// The flatness test is called with a threshold of 1.
void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh, int bd) {
  int col;

  for (col = 0; col < 8; ++col, ++s) {
    uint16_t *const p3 = s - 4 * pitch;
    uint16_t *const p2 = s - 3 * pitch;
    uint16_t *const p1 = s - 2 * pitch;
    uint16_t *const p0 = s - pitch;
    uint16_t *const q0 = s;
    uint16_t *const q1 = s + pitch;
    uint16_t *const q2 = s + 2 * pitch;
    uint16_t *const q3 = s + 3 * pitch;
    const int8_t mask = highbd_filter_mask(*limit, *blimit, *p3, *p2, *p1, *p0,
                                           *q0, *q1, *q2, *q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, *p3, *p2, *p1, *p0, *q0, *q1, *q2, *q3, bd);

    highbd_filter8(mask, *thresh, flat, p3, p2, p1, p0, q0, q1, q2, q3, bd);
  }
}
551
// Runs vpx_highbd_lpf_horizontal_8_c() twice: on the 8 columns starting at s
// with (blimit0, limit0, thresh0), then on the 8 columns starting at s + 8
// with (blimit1, limit1, thresh1). Both calls use the same bd.
void vpx_highbd_lpf_horizontal_8_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const second_half = s + 8;

  vpx_highbd_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_horizontal_8_c(second_half, pitch, blimit1, limit1, thresh1,
                                bd);
}
559
// Applies highbd_filter8() across a vertical edge for 8 consecutive rows of
// 16-bit samples.
//   s      - q0 sample of the first row; p3..p0 are s[-4]..s[-1] and q1..q3
//            are s[1]..s[3].
//   pitch  - offset, in samples, from one row to the next.
//   blimit, limit, thresh - pointers to the thresholds; only the first byte
//            of each is read.
//   bd     - bit depth, forwarded to the mask and filter helpers.
// The flatness test is called with a threshold of 1.
void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  int row;

  for (row = 0; row < 8; ++row, s += pitch) {
    const uint16_t p3 = s[-4];
    const uint16_t p2 = s[-3];
    const uint16_t p1 = s[-2];
    const uint16_t p0 = s[-1];
    const uint16_t q0 = s[0];
    const uint16_t q1 = s[1];
    const uint16_t q2 = s[2];
    const uint16_t q3 = s[3];
    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);

    highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
                   s + 2, s + 3, bd);
  }
}
577
// Runs vpx_highbd_lpf_vertical_8_c() twice: on the 8 rows starting at s with
// (blimit0, limit0, thresh0), then on the 8 rows starting at s + 8 * pitch
// with (blimit1, limit1, thresh1). Both calls use the same bd.
void vpx_highbd_lpf_vertical_8_dual_c(
    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  uint16_t *const lower_half = s + 8 * pitch;

  vpx_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
  vpx_highbd_lpf_vertical_8_c(lower_half, pitch, blimit1, limit1, thresh1, bd);
}
586
highbd_filter16(int8_t mask,uint8_t thresh,uint8_t flat,uint8_t flat2,uint16_t * op7,uint16_t * op6,uint16_t * op5,uint16_t * op4,uint16_t * op3,uint16_t * op2,uint16_t * op1,uint16_t * op0,uint16_t * oq0,uint16_t * oq1,uint16_t * oq2,uint16_t * oq3,uint16_t * oq4,uint16_t * oq5,uint16_t * oq6,uint16_t * oq7,int bd)587 static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
588 uint8_t flat2, uint16_t *op7, uint16_t *op6,
589 uint16_t *op5, uint16_t *op4, uint16_t *op3,
590 uint16_t *op2, uint16_t *op1, uint16_t *op0,
591 uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
592 uint16_t *oq3, uint16_t *oq4, uint16_t *oq5,
593 uint16_t *oq6, uint16_t *oq7, int bd) {
594 if (flat2 && flat && mask) {
595 const uint16_t p7 = *op7;
596 const uint16_t p6 = *op6;
597 const uint16_t p5 = *op5;
598 const uint16_t p4 = *op4;
599 const uint16_t p3 = *op3;
600 const uint16_t p2 = *op2;
601 const uint16_t p1 = *op1;
602 const uint16_t p0 = *op0;
603 const uint16_t q0 = *oq0;
604 const uint16_t q1 = *oq1;
605 const uint16_t q2 = *oq2;
606 const uint16_t q3 = *oq3;
607 const uint16_t q4 = *oq4;
608 const uint16_t q5 = *oq5;
609 const uint16_t q6 = *oq6;
610 const uint16_t q7 = *oq7;
611
612 // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
613 *op6 = ROUND_POWER_OF_TWO(
614 p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
615 *op5 = ROUND_POWER_OF_TWO(
616 p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
617 *op4 = ROUND_POWER_OF_TWO(
618 p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
619 *op3 = ROUND_POWER_OF_TWO(
620 p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
621 *op2 = ROUND_POWER_OF_TWO(
622 p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + q0 + q1 + q2 + q3 + q4,
623 4);
624 *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 +
625 q0 + q1 + q2 + q3 + q4 + q5,
626 4);
627 *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 +
628 q1 + q2 + q3 + q4 + q5 + q6,
629 4);
630 *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 +
631 q2 + q3 + q4 + q5 + q6 + q7,
632 4);
633 *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 +
634 q3 + q4 + q5 + q6 + q7 * 2,
635 4);
636 *oq2 = ROUND_POWER_OF_TWO(
637 p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3,
638 4);
639 *oq3 = ROUND_POWER_OF_TWO(
640 p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4);
641 *oq4 = ROUND_POWER_OF_TWO(
642 p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4);
643 *oq5 = ROUND_POWER_OF_TWO(
644 p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4);
645 *oq6 = ROUND_POWER_OF_TWO(
646 p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4);
647 } else {
648 highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
649 bd);
650 }
651 }
652
// Applies highbd_filter16() across a horizontal edge for 8 * count adjacent
// columns of 16-bit samples.
//   s      - q0 sample of the first column; rows s - pitch .. s - 8 * pitch
//            hold p0..p7 and rows s + pitch .. s + 7 * pitch hold q1..q7.
//   pitch  - offset, in samples, between vertically adjacent rows.
//   blimit, limit, thresh - pointers to the thresholds; only the first byte
//            of each is read.
//   count  - number of 8-column groups to process.
//   bd     - bit depth, forwarded to the mask and filter helpers.
// Both flatness tests are called with a threshold of 1. The wide test (flat2)
// compares p4..p7 against p0 and q4..q7 against q0.
static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int pitch,
                                            const uint8_t *blimit,
                                            const uint8_t *limit,
                                            const uint8_t *thresh, int count,
                                            int bd) {
  int col;

  for (col = 0; col < 8 * count; ++col, ++s) {
    uint16_t *const p7 = s - 8 * pitch;
    uint16_t *const p6 = s - 7 * pitch;
    uint16_t *const p5 = s - 6 * pitch;
    uint16_t *const p4 = s - 5 * pitch;
    uint16_t *const p3 = s - 4 * pitch;
    uint16_t *const p2 = s - 3 * pitch;
    uint16_t *const p1 = s - 2 * pitch;
    uint16_t *const p0 = s - pitch;
    uint16_t *const q0 = s;
    uint16_t *const q1 = s + pitch;
    uint16_t *const q2 = s + 2 * pitch;
    uint16_t *const q3 = s + 3 * pitch;
    uint16_t *const q4 = s + 4 * pitch;
    uint16_t *const q5 = s + 5 * pitch;
    uint16_t *const q6 = s + 6 * pitch;
    uint16_t *const q7 = s + 7 * pitch;
    const int8_t mask = highbd_filter_mask(*limit, *blimit, *p3, *p2, *p1, *p0,
                                           *q0, *q1, *q2, *q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, *p3, *p2, *p1, *p0, *q0, *q1, *q2, *q3, bd);
    const int8_t flat2 = highbd_flat_mask5(1, *p7, *p6, *p5, *p4, *p0, *q0,
                                           *q4, *q5, *q6, *q7, bd);

    highbd_filter16(mask, *thresh, flat, flat2, p7, p6, p5, p4, p3, p2, p1, p0,
                    q0, q1, q2, q3, q4, q5, q6, q7, bd);
  }
}
687
// Wide filter on a horizontal edge for a single 8-column group of 16-bit
// samples; forwards to highbd_mb_lpf_horizontal_edge_w() with a group count
// of 1.
void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch,
                                    const uint8_t *blimit, const uint8_t *limit,
                                    const uint8_t *thresh, int bd) {
  const int groups = 1;

  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, groups, bd);
}
693
// Wide filter on a horizontal edge for two 8-column groups (16 columns) of
// 16-bit samples that share one set of thresholds; forwards to
// highbd_mb_lpf_horizontal_edge_w() with a group count of 2.
void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch,
                                         const uint8_t *blimit,
                                         const uint8_t *limit,
                                         const uint8_t *thresh, int bd) {
  const int groups = 2;

  highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, groups, bd);
}
700
// Applies highbd_filter16() across a vertical edge for count consecutive rows
// of 16-bit samples.
//   s      - q0 sample of the first row; p7..p0 are s[-8]..s[-1] and q1..q7
//            are s[1]..s[7].
//   pitch  - offset, in samples, from one row to the next.
//   blimit, limit, thresh - pointers to the thresholds; only the first byte
//            of each is read.
//   count  - number of rows to process.
//   bd     - bit depth, forwarded to the mask and filter helpers.
// Both flatness tests are called with a threshold of 1. The wide test (flat2)
// compares p4..p7 against p0 and q4..q7 against q0.
static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int pitch,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int count,
                                          int bd) {
  int row;

  for (row = 0; row < count; ++row, s += pitch) {
    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat =
        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
    const int8_t flat2 = highbd_flat_mask5(1, s[-8], s[-7], s[-6], s[-5], p0,
                                           q0, s[4], s[5], s[6], s[7], bd);

    highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5,
                    s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4,
                    s + 5, s + 6, s + 7, bd);
  }
}
730
// Wide filter on a vertical edge for 8 rows of 16-bit samples; forwards to
// highbd_mb_lpf_vertical_edge_w() with a row count of 8.
void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
  const int rows = 8;

  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, rows, bd);
}
736
// Wide filter on a vertical edge for 16 rows of 16-bit samples that share one
// set of thresholds; forwards to highbd_mb_lpf_vertical_edge_w() with a row
// count of 16.
void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh, int bd) {
  const int rows = 16;

  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, rows, bd);
}
743 #endif // CONFIG_VP9_HIGHBITDEPTH
744