1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
13 #define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
14 
15 #include "config/av1_rtcd.h"
16 
17 #include "av1/common/cdef_block.h"
18 
19 /* partial A is a 16-bit vector of the form:
20    [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
21    [0  y1 y2 y3 y4 y5 y6 y7].
22    This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
24    and const2. */
fold_mul_and_sum(v128 partiala,v128 partialb,v128 const1,v128 const2)25 static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
26                                     v128 const2) {
27   v128 tmp;
28   /* Reverse partial B. */
29   partialb = v128_shuffle_8(
30       partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
31   /* Interleave the x and y values of identical indices and pair x8 with 0. */
32   tmp = partiala;
33   partiala = v128_ziplo_16(partialb, partiala);
34   partialb = v128_ziphi_16(partialb, tmp);
35   /* Square and add the corresponding x and y values. */
36   partiala = v128_madd_s16(partiala, partiala);
37   partialb = v128_madd_s16(partialb, partialb);
38   /* Multiply by constant. */
39   partiala = v128_mullo_s32(partiala, const1);
40   partialb = v128_mullo_s32(partialb, const2);
41   /* Sum all results. */
42   partiala = v128_add_32(partiala, partialb);
43   return partiala;
44 }
45 
hsum4(v128 x0,v128 x1,v128 x2,v128 x3)46 static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
47   v128 t0, t1, t2, t3;
48   t0 = v128_ziplo_32(x1, x0);
49   t1 = v128_ziplo_32(x3, x2);
50   t2 = v128_ziphi_32(x1, x0);
51   t3 = v128_ziphi_32(x3, x2);
52   x0 = v128_ziplo_64(t1, t0);
53   x1 = v128_ziphi_64(t1, t0);
54   x2 = v128_ziplo_64(t3, t2);
55   x3 = v128_ziphi_64(t3, t2);
56   return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3));
57 }
58 
/* Computes cost for directions 4, 5, 6 and 7. We can call this function again
   on the rotated lines to compute the remaining directions (0 to 3). */
compute_directions(v128 lines[8],int32_t tmp_cost1[4])61 static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
62   v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
63   v128 partial6;
64   v128 tmp;
65   /* Partial sums for lines 0 and 1. */
66   partial4a = v128_shl_n_byte(lines[0], 14);
67   partial4b = v128_shr_n_byte(lines[0], 2);
68   partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
69   partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
70   tmp = v128_add_16(lines[0], lines[1]);
71   partial5a = v128_shl_n_byte(tmp, 10);
72   partial5b = v128_shr_n_byte(tmp, 6);
73   partial7a = v128_shl_n_byte(tmp, 4);
74   partial7b = v128_shr_n_byte(tmp, 12);
75   partial6 = tmp;
76 
77   /* Partial sums for lines 2 and 3. */
78   partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
79   partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
80   partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
81   partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
82   tmp = v128_add_16(lines[2], lines[3]);
83   partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
84   partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
85   partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
86   partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
87   partial6 = v128_add_16(partial6, tmp);
88 
89   /* Partial sums for lines 4 and 5. */
90   partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
91   partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
92   partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
93   partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
94   tmp = v128_add_16(lines[4], lines[5]);
95   partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
96   partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
97   partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
98   partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
99   partial6 = v128_add_16(partial6, tmp);
100 
101   /* Partial sums for lines 6 and 7. */
102   partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
103   partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
104   partial4a = v128_add_16(partial4a, lines[7]);
105   tmp = v128_add_16(lines[6], lines[7]);
106   partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
107   partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
108   partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
109   partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
110   partial6 = v128_add_16(partial6, tmp);
111 
112   /* Compute costs in terms of partial sums. */
113   partial4a =
114       fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
115                        v128_from_32(105, 120, 140, 168));
116   partial7a =
117       fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
118                        v128_from_32(105, 105, 105, 140));
119   partial5a =
120       fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
121                        v128_from_32(105, 105, 105, 140));
122   partial6 = v128_madd_s16(partial6, partial6);
123   partial6 = v128_mullo_s32(partial6, v128_dup_32(105));
124 
125   partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
126   v128_store_unaligned(tmp_cost1, partial4a);
127   return partial4a;
128 }
129 
130 /* transpose and reverse the order of the lines -- equivalent to a 90-degree
131    counter-clockwise rotation of the pixels. */
array_reverse_transpose_8x8(v128 * in,v128 * res)132 static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
133   const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
134   const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
135   const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
136   const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
137   const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
138   const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
139   const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
140   const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);
141 
142   const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
143   const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
144   const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
145   const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
146   const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
147   const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
148   const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
149   const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);
150 
151   res[7] = v128_ziplo_64(tr1_1, tr1_0);
152   res[6] = v128_ziphi_64(tr1_1, tr1_0);
153   res[5] = v128_ziplo_64(tr1_3, tr1_2);
154   res[4] = v128_ziphi_64(tr1_3, tr1_2);
155   res[3] = v128_ziplo_64(tr1_5, tr1_4);
156   res[2] = v128_ziphi_64(tr1_5, tr1_4);
157   res[1] = v128_ziplo_64(tr1_7, tr1_6);
158   res[0] = v128_ziphi_64(tr1_7, tr1_6);
159 }
160 
SIMD_FUNC(cdef_find_dir)161 int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
162                              int coeff_shift) {
163   int i;
164   int32_t cost[8];
165   int32_t best_cost = 0;
166   int best_dir = 0;
167   v128 lines[8];
168   for (i = 0; i < 8; i++) {
169     lines[i] = v128_load_unaligned(&img[i * stride]);
170     lines[i] =
171         v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
172   }
173 
174   /* Compute "mostly vertical" directions. */
175   v128 dir47 = compute_directions(lines, cost + 4);
176 
177   array_reverse_transpose_8x8(lines, lines);
178 
179   /* Compute "mostly horizontal" directions. */
180   v128 dir03 = compute_directions(lines, cost);
181 
182   v128 max = v128_max_s32(dir03, dir47);
183   max = v128_max_s32(max, v128_align(max, max, 8));
184   max = v128_max_s32(max, v128_align(max, max, 4));
185   best_cost = v128_low_u32(max);
186   v128 t =
187       v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03));
188   best_dir = v128_movemask_8(v128_pack_s16_s8(t, t));
189   best_dir = get_msb(best_dir ^ (best_dir - 1));  // Count trailing zeros
190 
191   /* Difference between the optimal variance and the variance along the
192      orthogonal direction. Again, the sum(x^2) terms cancel out. */
193   *var = best_cost - cost[(best_dir + 4) & 7];
194   /* We'd normally divide by 840, but dividing by 1024 is close enough
195      for what we're going to do with this. */
196   *var >>= 10;
197   return best_dir;
198 }
199 
200 // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
constrain16(v256 a,v256 b,unsigned int threshold,unsigned int adjdamp)201 SIMD_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
202                              unsigned int adjdamp) {
203   v256 diff = v256_sub_16(a, b);
204   const v256 sign = v256_shr_n_s16(diff, 15);
205   diff = v256_abs_s16(diff);
206   const v256 s =
207       v256_ssub_u16(v256_dup_16(threshold), v256_shr_u16(diff, adjdamp));
208   return v256_xor(v256_add_16(sign, v256_min_s16(diff, s)), sign);
209 }
210 
211 // sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
constrain(v256 a,v256 b,unsigned int strength,unsigned int adjdamp)212 SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
213                            unsigned int adjdamp) {
214   const v256 diff16 = v256_sub_16(a, b);
215   v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
216   const v128 sign = v128_cmplt_s8(diff, v128_zero());
217   diff = v128_abs_s8(diff);
218   return v128_xor(
219       v128_add_8(sign,
220                  v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
221                                                 v128_shr_u8(diff, adjdamp)))),
222       sign);
223 }
224 
SIMD_FUNC(cdef_filter_block_4x4_8)225 void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
226                                         const uint16_t *in, int pri_strength,
227                                         int sec_strength, int dir,
228                                         int pri_damping, int sec_damping,
229                                         int coeff_shift) {
230   v128 p0, p1, p2, p3;
231   v256 sum, row, tap, res;
232   v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
233   int po1 = cdef_directions[dir][0];
234   int po2 = cdef_directions[dir][1];
235   int s1o1 = cdef_directions[(dir + 2) & 7][0];
236   int s1o2 = cdef_directions[(dir + 2) & 7][1];
237   int s2o1 = cdef_directions[(dir + 6) & 7][0];
238   int s2o2 = cdef_directions[(dir + 6) & 7][1];
239 
240   const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
241   const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
242 
243   if (pri_strength)
244     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
245   if (sec_strength)
246     sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
247 
248   sum = v256_zero();
249   row = v256_from_v64(v64_load_aligned(&in[0 * CDEF_BSTRIDE]),
250                       v64_load_aligned(&in[1 * CDEF_BSTRIDE]),
251                       v64_load_aligned(&in[2 * CDEF_BSTRIDE]),
252                       v64_load_aligned(&in[3 * CDEF_BSTRIDE]));
253   max = min = row;
254 
255   if (pri_strength) {
256     // Primary near taps
257     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po1]),
258                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]),
259                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]),
260                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1]));
261     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
262     min = v256_min_s16(min, tap);
263     p0 = constrain(tap, row, pri_strength, pri_damping);
264     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]),
265                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]),
266                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]),
267                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1]));
268     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
269     min = v256_min_s16(min, tap);
270     p1 = constrain(tap, row, pri_strength, pri_damping);
271 
272     // sum += pri_taps[0] * (p0 + p1)
273     sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
274                                          v256_from_v128(v128_ziphi_8(p0, p1),
275                                                         v128_ziplo_8(p0, p1))));
276 
277     // Primary far taps
278     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po2]),
279                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]),
280                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]),
281                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2]));
282     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
283     min = v256_min_s16(min, tap);
284     p0 = constrain(tap, row, pri_strength, pri_damping);
285     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]),
286                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]),
287                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]),
288                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2]));
289     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
290     min = v256_min_s16(min, tap);
291     p1 = constrain(tap, row, pri_strength, pri_damping);
292 
293     // sum += pri_taps[1] * (p0 + p1)
294     sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
295                                          v256_from_v128(v128_ziphi_8(p0, p1),
296                                                         v128_ziplo_8(p0, p1))));
297   }
298 
299   if (sec_strength) {
300     // Secondary near taps
301     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o1]),
302                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]),
303                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]),
304                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1]));
305     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
306     min = v256_min_s16(min, tap);
307     p0 = constrain(tap, row, sec_strength, sec_damping);
308     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]),
309                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]),
310                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]),
311                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1]));
312     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
313     min = v256_min_s16(min, tap);
314     p1 = constrain(tap, row, sec_strength, sec_damping);
315     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]),
316                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]),
317                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]),
318                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1]));
319     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
320     min = v256_min_s16(min, tap);
321     p2 = constrain(tap, row, sec_strength, sec_damping);
322     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]),
323                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]),
324                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]),
325                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1]));
326     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
327     min = v256_min_s16(min, tap);
328     p3 = constrain(tap, row, sec_strength, sec_damping);
329 
330     // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
331     p0 = v128_add_8(p0, p1);
332     p2 = v128_add_8(p2, p3);
333     sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
334                                          v256_from_v128(v128_ziphi_8(p0, p2),
335                                                         v128_ziplo_8(p0, p2))));
336 
337     // Secondary far taps
338     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o2]),
339                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]),
340                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]),
341                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2]));
342     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
343     min = v256_min_s16(min, tap);
344     p0 = constrain(tap, row, sec_strength, sec_damping);
345     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]),
346                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]),
347                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]),
348                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2]));
349     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
350     min = v256_min_s16(min, tap);
351     p1 = constrain(tap, row, sec_strength, sec_damping);
352     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]),
353                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]),
354                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]),
355                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2]));
356     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
357     min = v256_min_s16(min, tap);
358     p2 = constrain(tap, row, sec_strength, sec_damping);
359     tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o2]),
360                         v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]),
361                         v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]),
362                         v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2]));
363     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
364     min = v256_min_s16(min, tap);
365     p3 = constrain(tap, row, sec_strength, sec_damping);
366 
367     // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
368     p0 = v128_add_8(p0, p1);
369     p2 = v128_add_8(p2, p3);
370 
371     sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
372                                          v256_from_v128(v128_ziphi_8(p0, p2),
373                                                         v128_ziplo_8(p0, p2))));
374   }
375 
376   // res = row + ((sum - (sum < 0) + 8) >> 4)
377   sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
378   res = v256_add_16(sum, v256_dup_16(8));
379   res = v256_shr_n_s16(res, 4);
380   res = v256_add_16(row, res);
381   res = v256_min_s16(v256_max_s16(res, min), max);
382   res = v256_pack_s16_u8(res, res);
383 
384   p0 = v256_low_v128(res);
385   u32_store_aligned(&dst[0 * dstride], v64_high_u32(v128_high_v64(p0)));
386   u32_store_aligned(&dst[1 * dstride], v64_low_u32(v128_high_v64(p0)));
387   u32_store_aligned(&dst[2 * dstride], v64_high_u32(v128_low_v64(p0)));
388   u32_store_aligned(&dst[3 * dstride], v64_low_u32(v128_low_v64(p0)));
389 }
390 
SIMD_FUNC(cdef_filter_block_8x8_8)391 void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
392                                         const uint16_t *in, int pri_strength,
393                                         int sec_strength, int dir,
394                                         int pri_damping, int sec_damping,
395                                         int coeff_shift) {
396   int i;
397   v128 p0, p1, p2, p3;
398   v256 sum, row, res, tap;
399   v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
400   int po1 = cdef_directions[dir][0];
401   int po2 = cdef_directions[dir][1];
402   int s1o1 = cdef_directions[(dir + 2) & 7][0];
403   int s1o2 = cdef_directions[(dir + 2) & 7][1];
404   int s2o1 = cdef_directions[(dir + 6) & 7][0];
405   int s2o2 = cdef_directions[(dir + 6) & 7][1];
406 
407   const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
408   const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
409 
410   if (pri_strength)
411     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
412   if (sec_strength)
413     sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
414   for (i = 0; i < 8; i += 2) {
415     sum = v256_zero();
416     row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
417                          v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
418 
419     max = min = row;
420     // Primary near taps
421     tap =
422         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
423                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
424     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
425     min = v256_min_s16(min, tap);
426     p0 = constrain(tap, row, pri_strength, pri_damping);
427     tap =
428         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
429                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
430     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
431     min = v256_min_s16(min, tap);
432     p1 = constrain(tap, row, pri_strength, pri_damping);
433 
434     // sum += pri_taps[0] * (p0 + p1)
435     sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
436                                          v256_from_v128(v128_ziphi_8(p0, p1),
437                                                         v128_ziplo_8(p0, p1))));
438 
439     // Primary far taps
440     tap =
441         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
442                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
443     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
444     min = v256_min_s16(min, tap);
445     p0 = constrain(tap, row, pri_strength, pri_damping);
446     tap =
447         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
448                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
449     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
450     min = v256_min_s16(min, tap);
451     p1 = constrain(tap, row, pri_strength, pri_damping);
452 
453     // sum += pri_taps[1] * (p0 + p1)
454     sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
455                                          v256_from_v128(v128_ziphi_8(p0, p1),
456                                                         v128_ziplo_8(p0, p1))));
457 
458     // Secondary near taps
459     tap =
460         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
461                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
462     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
463     min = v256_min_s16(min, tap);
464     p0 = constrain(tap, row, sec_strength, sec_damping);
465     tap =
466         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
467                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
468     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
469     min = v256_min_s16(min, tap);
470     p1 = constrain(tap, row, sec_strength, sec_damping);
471     tap =
472         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
473                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
474     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
475     min = v256_min_s16(min, tap);
476     p2 = constrain(tap, row, sec_strength, sec_damping);
477     tap =
478         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
479                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
480     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
481     min = v256_min_s16(min, tap);
482     p3 = constrain(tap, row, sec_strength, sec_damping);
483 
484     // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
485     p0 = v128_add_8(p0, p1);
486     p2 = v128_add_8(p2, p3);
487     sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
488                                          v256_from_v128(v128_ziphi_8(p0, p2),
489                                                         v128_ziplo_8(p0, p2))));
490 
491     // Secondary far taps
492     tap =
493         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
494                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
495     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
496     min = v256_min_s16(min, tap);
497     p0 = constrain(tap, row, sec_strength, sec_damping);
498     tap =
499         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
500                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
501     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
502     min = v256_min_s16(min, tap);
503     p1 = constrain(tap, row, sec_strength, sec_damping);
504     tap =
505         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
506                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
507     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
508     min = v256_min_s16(min, tap);
509     p2 = constrain(tap, row, sec_strength, sec_damping);
510     tap =
511         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
512                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
513     max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
514     min = v256_min_s16(min, tap);
515     p3 = constrain(tap, row, sec_strength, sec_damping);
516 
517     // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
518     p0 = v128_add_8(p0, p1);
519     p2 = v128_add_8(p2, p3);
520     sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
521                                          v256_from_v128(v128_ziphi_8(p0, p2),
522                                                         v128_ziplo_8(p0, p2))));
523 
524     // res = row + ((sum - (sum < 0) + 8) >> 4)
525     sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
526     res = v256_add_16(sum, v256_dup_16(8));
527     res = v256_shr_n_s16(res, 4);
528     res = v256_add_16(row, res);
529     res = v256_min_s16(v256_max_s16(res, min), max);
530     res = v256_pack_s16_u8(res, res);
531 
532     p0 = v256_low_v128(res);
533     v64_store_aligned(&dst[i * dstride], v128_high_v64(p0));
534     v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(p0));
535   }
536 }
537 
SIMD_FUNC(cdef_filter_block_4x4_16)538 void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
539                                          const uint16_t *in, int pri_strength,
540                                          int sec_strength, int dir,
541                                          int pri_damping, int sec_damping,
542                                          int coeff_shift) {
543   int i;
544   v256 p0, p1, p2, p3, sum, row, res;
545   v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
546   int po1 = cdef_directions[dir][0];
547   int po2 = cdef_directions[dir][1];
548   int s1o1 = cdef_directions[(dir + 2) & 7][0];
549   int s1o2 = cdef_directions[(dir + 2) & 7][1];
550   int s2o1 = cdef_directions[(dir + 6) & 7][0];
551   int s2o2 = cdef_directions[(dir + 6) & 7][1];
552 
553   const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
554   const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
555 
556   if (pri_strength)
557     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
558   if (sec_strength)
559     sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
560   for (i = 0; i < 4; i += 4) {
561     sum = v256_zero();
562     row = v256_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]),
563                         v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]),
564                         v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]),
565                         v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE]));
566     min = max = row;
567 
568     // Primary near taps
569     p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
570                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]),
571                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]),
572                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1]));
573     p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
574                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]),
575                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]),
576                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1]));
577     max =
578         v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
579                      v256_andn(p1, v256_cmpeq_16(p1, large)));
580     min = v256_min_s16(v256_min_s16(min, p0), p1);
581     p0 = constrain16(p0, row, pri_strength, pri_damping);
582     p1 = constrain16(p1, row, pri_strength, pri_damping);
583 
584     // sum += pri_taps[0] * (p0 + p1)
585     sum = v256_add_16(
586         sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
587 
588     // Primary far taps
589     p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
590                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]),
591                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]),
592                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2]));
593     p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
594                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]),
595                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]),
596                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2]));
597     max =
598         v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
599                      v256_andn(p1, v256_cmpeq_16(p1, large)));
600     min = v256_min_s16(v256_min_s16(min, p0), p1);
601     p0 = constrain16(p0, row, pri_strength, pri_damping);
602     p1 = constrain16(p1, row, pri_strength, pri_damping);
603 
604     // sum += pri_taps[1] * (p0 + p1)
605     sum = v256_add_16(
606         sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
607 
608     // Secondary near taps
609     p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
610                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]),
611                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]),
612                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1]));
613     p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
614                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]),
615                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]),
616                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1]));
617     p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
618                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]),
619                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]),
620                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1]));
621     p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
622                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]),
623                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]),
624                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1]));
625     max =
626         v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
627                      v256_andn(p1, v256_cmpeq_16(p1, large)));
628     max =
629         v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
630                      v256_andn(p3, v256_cmpeq_16(p3, large)));
631     min = v256_min_s16(
632         v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
633     p0 = constrain16(p0, row, sec_strength, sec_damping);
634     p1 = constrain16(p1, row, sec_strength, sec_damping);
635     p2 = constrain16(p2, row, sec_strength, sec_damping);
636     p3 = constrain16(p3, row, sec_strength, sec_damping);
637 
638     // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
639     sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
640                                           v256_add_16(v256_add_16(p0, p1),
641                                                       v256_add_16(p2, p3))));
642 
643     // Secondary far taps
644     p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
645                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]),
646                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]),
647                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2]));
648     p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
649                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]),
650                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]),
651                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2]));
652     p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
653                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]),
654                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]),
655                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2]));
656     p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
657                        v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]),
658                        v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]),
659                        v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2]));
660     max =
661         v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
662                      v256_andn(p1, v256_cmpeq_16(p1, large)));
663     max =
664         v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
665                      v256_andn(p3, v256_cmpeq_16(p3, large)));
666     min = v256_min_s16(
667         v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
668     p0 = constrain16(p0, row, sec_strength, sec_damping);
669     p1 = constrain16(p1, row, sec_strength, sec_damping);
670     p2 = constrain16(p2, row, sec_strength, sec_damping);
671     p3 = constrain16(p3, row, sec_strength, sec_damping);
672 
673     // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
674     sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
675                                           v256_add_16(v256_add_16(p0, p1),
676                                                       v256_add_16(p2, p3))));
677 
678     // res = row + ((sum - (sum < 0) + 8) >> 4)
679     sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
680     res = v256_add_16(sum, v256_dup_16(8));
681     res = v256_shr_n_s16(res, 4);
682     res = v256_add_16(row, res);
683     res = v256_min_s16(v256_max_s16(res, min), max);
684 
685     v64_store_aligned(&dst[i * dstride], v128_high_v64(v256_high_v128(res)));
686     v64_store_aligned(&dst[(i + 1) * dstride],
687                       v128_low_v64(v256_high_v128(res)));
688     v64_store_aligned(&dst[(i + 2) * dstride],
689                       v128_high_v64(v256_low_v128(res)));
690     v64_store_aligned(&dst[(i + 3) * dstride],
691                       v128_low_v64(v256_low_v128(res)));
692   }
693 }
694 
SIMD_FUNC(cdef_filter_block_8x8_16)695 void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
696                                          const uint16_t *in, int pri_strength,
697                                          int sec_strength, int dir,
698                                          int pri_damping, int sec_damping,
699                                          int coeff_shift) {
700   int i;
701   v256 sum, p0, p1, p2, p3, row, res;
702   v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
703   int po1 = cdef_directions[dir][0];
704   int po2 = cdef_directions[dir][1];
705   int s1o1 = cdef_directions[(dir + 2) & 7][0];
706   int s1o2 = cdef_directions[(dir + 2) & 7][1];
707   int s2o1 = cdef_directions[(dir + 6) & 7][0];
708   int s2o2 = cdef_directions[(dir + 6) & 7][1];
709 
710   const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
711   const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
712 
713   if (pri_strength)
714     pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
715   if (sec_strength)
716     sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
717 
718   for (i = 0; i < 8; i += 2) {
719     sum = v256_zero();
720     row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
721                          v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
722 
723     min = max = row;
724     // Primary near taps
725     p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
726                         v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
727     p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
728                         v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
729     max =
730         v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
731                      v256_andn(p1, v256_cmpeq_16(p1, large)));
732     min = v256_min_s16(v256_min_s16(min, p0), p1);
733     p0 = constrain16(p0, row, pri_strength, pri_damping);
734     p1 = constrain16(p1, row, pri_strength, pri_damping);
735 
736     // sum += pri_taps[0] * (p0 + p1)
737     sum = v256_add_16(
738         sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
739 
740     // Primary far taps
741     p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
742                         v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
743     p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
744                         v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
745     max =
746         v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
747                      v256_andn(p1, v256_cmpeq_16(p1, large)));
748     min = v256_min_s16(v256_min_s16(min, p0), p1);
749     p0 = constrain16(p0, row, pri_strength, pri_damping);
750     p1 = constrain16(p1, row, pri_strength, pri_damping);
751 
752     // sum += pri_taps[1] * (p0 + p1)
753     sum = v256_add_16(
754         sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
755 
756     // Secondary near taps
757     p0 =
758         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
759                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
760     p1 =
761         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
762                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
763     p2 =
764         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
765                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
766     p3 =
767         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
768                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
769     max =
770         v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
771                      v256_andn(p1, v256_cmpeq_16(p1, large)));
772     max =
773         v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
774                      v256_andn(p3, v256_cmpeq_16(p3, large)));
775     min = v256_min_s16(
776         v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
777     p0 = constrain16(p0, row, sec_strength, sec_damping);
778     p1 = constrain16(p1, row, sec_strength, sec_damping);
779     p2 = constrain16(p2, row, sec_strength, sec_damping);
780     p3 = constrain16(p3, row, sec_strength, sec_damping);
781 
782     // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
783     sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
784                                           v256_add_16(v256_add_16(p0, p1),
785                                                       v256_add_16(p2, p3))));
786 
787     // Secondary far taps
788     p0 =
789         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
790                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
791     p1 =
792         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
793                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
794     p2 =
795         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
796                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
797     p3 =
798         v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
799                        v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
800     max =
801         v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
802                      v256_andn(p1, v256_cmpeq_16(p1, large)));
803     max =
804         v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
805                      v256_andn(p3, v256_cmpeq_16(p3, large)));
806     min = v256_min_s16(
807         v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
808     p0 = constrain16(p0, row, sec_strength, sec_damping);
809     p1 = constrain16(p1, row, sec_strength, sec_damping);
810     p2 = constrain16(p2, row, sec_strength, sec_damping);
811     p3 = constrain16(p3, row, sec_strength, sec_damping);
812 
813     // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
814     sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
815                                           v256_add_16(v256_add_16(p0, p1),
816                                                       v256_add_16(p2, p3))));
817 
818     // res = row + ((sum - (sum < 0) + 8) >> 4)
819     sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
820     res = v256_add_16(sum, v256_dup_16(8));
821     res = v256_shr_n_s16(res, 4);
822     res = v256_add_16(row, res);
823     res = v256_min_s16(v256_max_s16(res, min), max);
824     v128_store_unaligned(&dst[i * dstride], v256_high_v128(res));
825     v128_store_unaligned(&dst[(i + 1) * dstride], v256_low_v128(res));
826   }
827 }
828 
SIMD_FUNC(cdef_filter_block)829 void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride,
830                                   const uint16_t *in, int pri_strength,
831                                   int sec_strength, int dir, int pri_damping,
832                                   int sec_damping, int bsize, int coeff_shift) {
833   if (dst8) {
834     if (bsize == BLOCK_8X8) {
835       SIMD_FUNC(cdef_filter_block_8x8_8)
836       (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
837        sec_damping, coeff_shift);
838     } else if (bsize == BLOCK_4X8) {
839       SIMD_FUNC(cdef_filter_block_4x4_8)
840       (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
841        sec_damping, coeff_shift);
842       SIMD_FUNC(cdef_filter_block_4x4_8)
843       (dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
844        sec_strength, dir, pri_damping, sec_damping, coeff_shift);
845     } else if (bsize == BLOCK_8X4) {
846       SIMD_FUNC(cdef_filter_block_4x4_8)
847       (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
848        sec_damping, coeff_shift);
849       SIMD_FUNC(cdef_filter_block_4x4_8)
850       (dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
851        sec_damping, coeff_shift);
852     } else {
853       SIMD_FUNC(cdef_filter_block_4x4_8)
854       (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
855        sec_damping, coeff_shift);
856     }
857   } else {
858     if (bsize == BLOCK_8X8) {
859       SIMD_FUNC(cdef_filter_block_8x8_16)
860       (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
861        sec_damping, coeff_shift);
862     } else if (bsize == BLOCK_4X8) {
863       SIMD_FUNC(cdef_filter_block_4x4_16)
864       (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
865        sec_damping, coeff_shift);
866       SIMD_FUNC(cdef_filter_block_4x4_16)
867       (dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
868        sec_strength, dir, pri_damping, sec_damping, coeff_shift);
869     } else if (bsize == BLOCK_8X4) {
870       SIMD_FUNC(cdef_filter_block_4x4_16)
871       (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
872        sec_damping, coeff_shift);
873       SIMD_FUNC(cdef_filter_block_4x4_16)
874       (dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
875        sec_damping, coeff_shift);
876     } else {
877       assert(bsize == BLOCK_4X4);
878       SIMD_FUNC(cdef_filter_block_4x4_16)
879       (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
880        sec_damping, coeff_shift);
881     }
882   }
883 }
884 
SIMD_FUNC(copy_rect8_8bit_to_16bit)885 void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
886                                          const uint8_t *src, int sstride, int v,
887                                          int h) {
888   int i, j;
889   for (i = 0; i < v; i++) {
890     for (j = 0; j < (h & ~0x7); j += 8) {
891       v64 row = v64_load_unaligned(&src[i * sstride + j]);
892       v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
893     }
894     for (; j < h; j++) {
895       dst[i * dstride + j] = src[i * sstride + j];
896     }
897   }
898 }
899 
SIMD_FUNC(copy_rect8_16bit_to_16bit)900 void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
901                                           const uint16_t *src, int sstride,
902                                           int v, int h) {
903   int i, j;
904   for (i = 0; i < v; i++) {
905     for (j = 0; j < (h & ~0x7); j += 8) {
906       v128 row = v128_load_unaligned(&src[i * sstride + j]);
907       v128_store_unaligned(&dst[i * dstride + j], row);
908     }
909     for (; j < h; j++) {
910       dst[i * dstride + j] = src[i * sstride + j];
911     }
912   }
913 }
914 
915 #endif  // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
916