1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <math.h>
13 
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16 
17 #include "aom_dsp/aom_dsp_common.h"
18 #include "aom_mem/aom_mem.h"
19 #include "aom_ports/mem.h"
20 #include "av1/common/av1_loopfilter.h"
21 #include "av1/common/onyxc_int.h"
22 #include "av1/common/reconinter.h"
23 #include "av1/common/seg_common.h"
24 
25 static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = {
26   { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H },
27   { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U },
28   { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V }
29 };
30 
31 static const int delta_lf_id_lut[MAX_MB_PLANE][2] = {
32   { 0, 1 }, { 2, 2 }, { 3, 3 }
33 };
34 
// Edge directions handled by the loop filter. NUM_EDGE_DIRS is the number of
// directions (2).
// NOTE(review): UENUM1BYTE is defined elsewhere; presumably it declares
// EDGE_DIR as a one-byte type - confirm against its definition.
enum {
  VERT_EDGE = 0,
  HORZ_EDGE = 1,
  NUM_EDGE_DIRS
} UENUM1BYTE(EDGE_DIR);
36 
// Maps a prediction mode to the index of the loop-filter mode delta that
// applies to it. The table is ordered like the prediction-mode enum and must
// contain one entry per mode.
static const int mode_lf_lut[] = {
  // INTRA_MODES (13 entries): always index 0.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // INTER_MODES (4 entries): index 0 only for GLOBALMV.
  1, 1, 0, 1,
  // INTER_COMPOUND_MODES (8 entries): index 0 only for GLOBAL_GLOBALMV.
  1, 1, 1, 1, 1, 1, 0, 1,
};
42 
// 256 bit masks (64x64 / 4x4) for left transform size for Y plane.
// We use 4 uint64_t to represent the 256 bits.
// Each 1 represents a position where we should apply a loop filter
// across the left border of a 4x4 block boundary.
//
// In the case of TX_8X8, written low-order byte first, we end up with
// a mask that looks like this ('-' and '|' are only visual separators):
50 //
51 //    10101010|10101010
52 //    10101010|10101010
53 //    10101010|10101010
54 //    10101010|10101010
55 //    10101010|10101010
56 //    10101010|10101010
57 //    10101010|10101010
58 //    10101010|10101010
59 //    -----------------
60 //    10101010|10101010
61 //    10101010|10101010
62 //    10101010|10101010
63 //    10101010|10101010
64 //    10101010|10101010
65 //    10101010|10101010
66 //    10101010|10101010
67 //    10101010|10101010
68 //
69 // A loopfilter should be applied to every other 4x4 horizontally.
70 
// 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
// We use 4 uint64_t to represent the 256 bits.
// Each 1 represents a position where we should apply a loop filter
// across the top border of a 4x4 block boundary.
//
// In the case of TX_8X8, written low-order byte first, we end up with
// a mask that looks like this ('-' and '|' are only visual separators):
78 //
79 //    11111111|11111111
80 //    00000000|00000000
81 //    11111111|11111111
82 //    00000000|00000000
83 //    11111111|11111111
84 //    00000000|00000000
85 //    11111111|11111111
86 //    00000000|00000000
87 //    -----------------
88 //    11111111|11111111
89 //    00000000|00000000
90 //    11111111|11111111
91 //    00000000|00000000
92 //    11111111|11111111
93 //    00000000|00000000
94 //    11111111|11111111
95 //    00000000|00000000
96 //
// A loopfilter should be applied to every other 4x4 vertically, i.e. to
// every other row of 4x4 blocks.
98 
99 const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = {
100   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18
101 };
102 
103 const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = {
104   -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13
105 };
106 
107 const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = {
108   -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8
109 };
110 
111 const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { -1, -1, -1, -1, -1, -1,
112                                                       -1, -1, -1, 0,  1,  2,
113                                                       3,  -1, -1, -1, -1, -1,
114                                                       -1, -1, -1, -1 };
115 const int mask_id_table_vert_border[BLOCK_SIZES_ALL] = { 0,  47, 49, 19, 51, 53,
116                                                          33, 55, 57, 42, 59, 60,
117                                                          46, -1, -1, -1, 61, 62,
118                                                          63, 64, 65, 66 };
119 
120 const FilterMask left_mask_univariant_reordered[67] = {
121   // TX_4X4
122   { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
123       0x0000000000000000ULL } },  // block size 4X4, TX_4X4
124   { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
125       0x0000000000000000ULL } },  // block size 4X8, TX_4X4
126   { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
127       0x0000000000000000ULL } },  // block size 8X4, TX_4X4
128   { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
129       0x0000000000000000ULL } },  // block size 8X8, TX_4X4
130   { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
131       0x0000000000000000ULL } },  // block size 8X16, TX_4X4
132   { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
133       0x0000000000000000ULL } },  // block size 16X8, TX_4X4
134   { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
135       0x0000000000000000ULL } },  // block size 16X16, TX_4X4
136   { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
137       0x0000000000000000ULL } },  // block size 16X32, TX_4X4
138   { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
139       0x0000000000000000ULL } },  // block size 32X16, TX_4X4
140   { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
141       0x0000000000000000ULL } },  // block size 32X32, TX_4X4
142   { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
143       0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
144   { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
145       0x0000000000000000ULL } },  // block size 64X32, TX_4X4
146   { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
147       0xffffffffffffffffULL } },  // block size 64X64, TX_4X4
148   { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
149       0x0000000000000000ULL } },  // block size 4X16, TX_4X4
150   { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
151       0x0000000000000000ULL } },  // block size 16X4, TX_4X4
152   { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
153       0x0000000000000000ULL } },  // block size 8X32, TX_4X4
154   { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
155       0x0000000000000000ULL } },  // block size 32X8, TX_4X4
156   { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
157       0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
158   { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
159       0x0000000000000000ULL } },  // block size 64X16, TX_4X4
160   // TX_8X8
161   { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
162       0x0000000000000000ULL } },  // block size 8X8, TX_8X8
163   { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
164       0x0000000000000000ULL } },  // block size 8X16, TX_8X8
165   { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
166       0x0000000000000000ULL } },  // block size 16X8, TX_8X8
167   { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
168       0x0000000000000000ULL } },  // block size 16X16, TX_8X8
169   { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
170       0x0000000000000000ULL } },  // block size 16X32, TX_8X8
171   { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
172       0x0000000000000000ULL } },  // block size 32X16, TX_8X8
173   { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
174       0x0000000000000000ULL } },  // block size 32X32, TX_8X8
175   { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
176       0x0055005500550055ULL } },  // block size 32X64, TX_8X8
177   { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
178       0x0000000000000000ULL } },  // block size 64X32, TX_8X8
179   { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
180       0x5555555555555555ULL } },  // block size 64X64, TX_8X8
181   { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
182       0x0000000000000000ULL } },  // block size 8X32, TX_8X8
183   { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
184       0x0000000000000000ULL } },  // block size 32X8, TX_8X8
185   { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
186       0x0005000500050005ULL } },  // block size 16X64, TX_8X8
187   { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
188       0x0000000000000000ULL } },  // block size 64X16, TX_8X8
189   // TX_16X16
190   { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
191       0x0000000000000000ULL } },  // block size 16X16, TX_16X16
192   { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
193       0x0000000000000000ULL } },  // block size 16X32, TX_16X16
194   { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
195       0x0000000000000000ULL } },  // block size 32X16, TX_16X16
196   { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
197       0x0000000000000000ULL } },  // block size 32X32, TX_16X16
198   { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
199       0x0011001100110011ULL } },  // block size 32X64, TX_16X16
200   { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
201       0x0000000000000000ULL } },  // block size 64X32, TX_16X16
202   { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
203       0x1111111111111111ULL } },  // block size 64X64, TX_16X16
204   { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
205       0x0001000100010001ULL } },  // block size 16X64, TX_16X16
206   { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
207       0x0000000000000000ULL } },  // block size 64X16, TX_16X16
208   // TX_32X32
209   { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
210       0x0000000000000000ULL } },  // block size 32X32, TX_32X32
211   { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
212       0x0101010101010101ULL } },  // block size 32X64, TX_32X32
213   { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
214       0x0000000000000000ULL } },  // block size 64X32, TX_32X32
215   { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
216       0x0101010101010101ULL } },  // block size 64X64, TX_32X32
217   // TX_64X64
218   { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
219       0x0001000100010001ULL } },  // block size 64X64, TX_64X64
220   // 2:1, 1:2 transform sizes.
221   { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
222       0x0000000000000000ULL } },  // block size 4X8, TX_4X8
223   { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
224       0x0000000000000000ULL } },  // block size 4X16, TX_4X8
225   { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
226       0x0000000000000000ULL } },  // block size 8X4, TX_8X4
227   { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
228       0x0000000000000000ULL } },  // block size 16X4, TX_8X4
229   { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
230       0x0000000000000000ULL } },  // block size 8X16, TX_8X16
231   { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
232       0x0000000000000000ULL } },  // block size 8X32, TX_8X16
233   { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
234       0x0000000000000000ULL } },  // block size 16X8, TX_16X8
235   { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
236       0x0000000000000000ULL } },  // block size 32X8, TX_16X8
237   { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
238       0x0000000000000000ULL } },  // block size 16X32, TX_16X32
239   { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
240       0x0001000100010001ULL } },  // block size 16X64, TX_16X32
241   { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
242       0x0000000000000000ULL } },  // block size 32X16, TX_32X16
243   { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
244       0x0000000000000000ULL } },  // block size 64X16, TX_32X16
245   { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
246       0x0001000100010001ULL } },  // block size 32X64, TX_32X64
247   { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
248       0x0000000000000000ULL } },  // block size 64X32, TX_64X32
249   // 4:1, 1:4 transform sizes.
250   { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
251       0x0000000000000000ULL } },  // block size 4X16, TX_4X16
252   { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
253       0x0000000000000000ULL } },  // block size 16X4, TX_16X4
254   { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
255       0x0000000000000000ULL } },  // block size 8X32, TX_8X32
256   { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
257       0x0000000000000000ULL } },  // block size 32X8, TX_32X8
258   { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
259       0x0001000100010001ULL } },  // block size 16X64, TX_16X64
260   { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
261       0x0000000000000000ULL } },  // block size 64X16, TX_64X16
262 };
263 
// Top-edge (horizontal edge) masks for a block placed at the top-left corner
// of a 64x64 area, one row per (block size, transform size) combination where
// a single transform size is used throughout the block.
//
// Layout of each entry: four uint64_t words; each word covers four rows of
// sixteen 4x4 columns, so bit ((row & 3) << 4) + col of word (row >> 2) is
// set when the 4x4 unit at (row, col) has a transform edge on its top side.
//
// The rows are positional and follow the same order as
// left_mask_univariant_reordered; the values in the mask_id_table_* tables
// correspond to row numbers here, so rows must not be reordered.
const FilterMask above_mask_univariant_reordered[67] = {
  // TX_4X4
  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 4X4, TX_4X4
  { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 4X8, TX_4X4
  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 8X4, TX_4X4
  { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 8X8, TX_4X4
  { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 8X16, TX_4X4
  { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X8, TX_4X4
  { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X16, TX_4X4
  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X32, TX_4X4
  { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 32X16, TX_4X4
  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 32X32, TX_4X4
  { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
      0x00ff00ff00ff00ffULL } },  // block size 32X64, TX_4X4
  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 64X32, TX_4X4
  { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
      0xffffffffffffffffULL } },  // block size 64X64, TX_4X4
  { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 4X16, TX_4X4
  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X4, TX_4X4
  { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 8X32, TX_4X4
  { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 32X8, TX_4X4
  { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
      0x000f000f000f000fULL } },  // block size 16X64, TX_4X4
  { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 64X16, TX_4X4
  // TX_8X8
  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 8X8, TX_8X8
  { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 8X16, TX_8X8
  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X8, TX_8X8
  { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X16, TX_8X8
  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X32, TX_8X8
  { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 32X16, TX_8X8
  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 32X32, TX_8X8
  { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
      0x000000ff000000ffULL } },  // block size 32X64, TX_8X8
  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 64X32, TX_8X8
  { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
      0x0000ffff0000ffffULL } },  // block size 64X64, TX_8X8
  { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 8X32, TX_8X8
  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 32X8, TX_8X8
  { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
      0x0000000f0000000fULL } },  // block size 16X64, TX_8X8
  { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 64X16, TX_8X8
  // TX_16X16
  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X16, TX_16X16
  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X32, TX_16X16
  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 32X16, TX_16X16
  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 32X32, TX_16X16
  { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
      0x00000000000000ffULL } },  // block size 32X64, TX_16X16
  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 64X32, TX_16X16
  { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
      0x000000000000ffffULL } },  // block size 64X64, TX_16X16
  { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
      0x000000000000000fULL } },  // block size 16X64, TX_16X16
  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 64X16, TX_16X16
  // TX_32X32
  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 32X32, TX_32X32
  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
      0x0000000000000000ULL } },  // block size 32X64, TX_32X32
  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 64X32, TX_32X32
  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
      0x0000000000000000ULL } },  // block size 64X64, TX_32X32
  // TX_64X64
  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 64X64, TX_64X64
  // 2:1, 1:2 transform sizes.
  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 4X8, TX_4X8
  { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 4X16, TX_4X8
  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 8X4, TX_8X4
  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X4, TX_8X4
  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 8X16, TX_8X16
  { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 8X32, TX_8X16
  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X8, TX_16X8
  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 32X8, TX_16X8
  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X32, TX_16X32
  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
      0x0000000000000000ULL } },  // block size 16X64, TX_16X32
  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 32X16, TX_32X16
  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 64X16, TX_32X16
  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 32X64, TX_32X64
  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 64X32, TX_64X32
  // 4:1, 1:4 transform sizes.
  { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 4X16, TX_4X16
  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X4, TX_16X4
  { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 8X32, TX_8X32
  { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 32X8, TX_32X8
  { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 16X64, TX_16X64
  { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
      0x0000000000000000ULL } },  // block size 64X16, TX_64X16
};
407 
408 #if LOOP_FILTER_BITMASK
get_loop_filter_mask(const AV1_COMMON * const cm,int mi_row,int mi_col)409 LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row,
410                                      int mi_col) {
411   assert(cm->lf.lfm != NULL);
412   const int row = mi_row >> MIN_MIB_SIZE_LOG2;  // 64x64
413   const int col = mi_col >> MIN_MIB_SIZE_LOG2;
414   return &cm->lf.lfm[row * cm->lf.lfm_stride + col];
415 }
416 
// Function-pointer types for the loop-filter kernels.
// NOTE(review): judging by the parameter names, s is presumably the sample
// pointer at the edge and p the buffer pitch, with blimit/limit/thresh being
// the threshold vectors - confirm against the kernel implementations.

// Kernel operating on 8-bit samples with one set of thresholds.
typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit,
                        const uint8_t *limit, const uint8_t *thresh);

// Kernel operating on 8-bit samples with two sets of thresholds
// (blimit0/limit0/thresh0 and blimit1/limit1/thresh1).
typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0,
                            const uint8_t *limit0, const uint8_t *thresh0,
                            const uint8_t *blimit1, const uint8_t *limit1,
                            const uint8_t *thresh1);

// Kernel operating on 16-bit samples with one set of thresholds; bd is
// presumably the bit depth of the samples.
typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh, int bd);

// Kernel operating on 16-bit samples with two sets of thresholds; bd is
// presumably the bit depth of the samples.
typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0,
                               const uint8_t *limit0, const uint8_t *thresh0,
                               const uint8_t *blimit1, const uint8_t *limit1,
                               const uint8_t *thresh1, int bd);
432 #endif  // LOOP_FILTER_BITMASK
433 
// Fills the per-level limit vectors (lim and mblim) of lfi for every filter
// level from 0 to MAX_LOOP_FILTER, taking the sharpness level into account.
static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
  // The level is shifted right by 0, 1 or 2 bits for sharpness 0, 1..4 and
  // above 4 respectively.
  const int sharp_shift = (sharpness_lvl > 0) + (sharpness_lvl > 4);

  for (int level = 0; level <= MAX_LOOP_FILTER; ++level) {
    int inside_limit = level >> sharp_shift;

    // With non-zero sharpness the limit is additionally capped.
    if (sharpness_lvl > 0 && inside_limit > 9 - sharpness_lvl) {
      inside_limit = 9 - sharpness_lvl;
    }

    // The limit is never allowed to drop below 1.
    if (inside_limit < 1) {
      inside_limit = 1;
    }

    // Each threshold is replicated across SIMD_WIDTH bytes.
    memset(lfi->lfthr[level].lim, inside_limit, SIMD_WIDTH);
    memset(lfi->lfthr[level].mblim, 2 * (level + 2) + inside_limit,
           SIMD_WIDTH);
  }
}
454 
get_filter_level(const AV1_COMMON * cm,const loop_filter_info_n * lfi_n,const int dir_idx,int plane,const MB_MODE_INFO * mbmi)455 uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n,
456                          const int dir_idx, int plane,
457                          const MB_MODE_INFO *mbmi) {
458   const int segment_id = mbmi->segment_id;
459   if (cm->delta_q_info.delta_lf_present_flag) {
460     int delta_lf;
461     if (cm->delta_q_info.delta_lf_multi) {
462       const int delta_lf_idx = delta_lf_id_lut[plane][dir_idx];
463       delta_lf = mbmi->delta_lf[delta_lf_idx];
464     } else {
465       delta_lf = mbmi->delta_lf_from_base;
466     }
467     int base_level;
468     if (plane == 0)
469       base_level = cm->lf.filter_level[dir_idx];
470     else if (plane == 1)
471       base_level = cm->lf.filter_level_u;
472     else
473       base_level = cm->lf.filter_level_v;
474     int lvl_seg = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER);
475     assert(plane >= 0 && plane <= 2);
476     const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir_idx];
477     if (segfeature_active(&cm->seg, segment_id, seg_lf_feature_id)) {
478       const int data = get_segdata(&cm->seg, segment_id, seg_lf_feature_id);
479       lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
480     }
481 
482     if (cm->lf.mode_ref_delta_enabled) {
483       const int scale = 1 << (lvl_seg >> 5);
484       lvl_seg += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale;
485       if (mbmi->ref_frame[0] > INTRA_FRAME)
486         lvl_seg += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale;
487       lvl_seg = clamp(lvl_seg, 0, MAX_LOOP_FILTER);
488     }
489     return lvl_seg;
490   } else {
491     return lfi_n->lvl[plane][segment_id][dir_idx][mbmi->ref_frame[0]]
492                      [mode_lf_lut[mbmi->mode]];
493   }
494 }
495 
// One-time loop-filter setup: enables the combined vertical/horizontal
// filtering flag, fills the sharpness-dependent limits and initializes the
// high-edge-variance threshold vector of every filter level.
void av1_loop_filter_init(AV1_COMMON *cm) {
  // mode_lf_lut must have exactly one entry per prediction mode.
  assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut));

  struct loopfilter *const lf = &cm->lf;
  loop_filter_info_n *const lfi = &cm->lf_info;

  lf->combine_vert_horz_lf = 1;

  // Limits for the current sharpness.
  update_sharpness(lfi, lf->sharpness_level);

  // The hev threshold is level / 16, replicated across SIMD_WIDTH bytes.
  for (int level = 0; level <= MAX_LOOP_FILTER; ++level) {
    memset(lfi->lfthr[level].hev_thr, level >> 4, SIMD_WIDTH);
  }
}
511 
512 // Update the loop filter for the current frame.
513 // This should be called before loop_filter_rows(),
514 // av1_loop_filter_frame() calls this function directly.
void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
                                int plane_end) {
  loop_filter_info_n *const lfi = &cm->lf_info;
  struct loopfilter *const lf = &cm->lf;
  const struct segmentation *const seg = &cm->seg;

  // Refresh the sharpness-dependent limits.
  update_sharpness(lfi, lf->sharpness_level);

  // Frame-level base levels as [direction][plane]. Only plane 0 has a
  // separate level per direction.
  const int base_lvl[2][MAX_MB_PLANE] = {
    { lf->filter_level[0], lf->filter_level_u, lf->filter_level_v },
    { lf->filter_level[1], lf->filter_level_u, lf->filter_level_v },
  };

  assert(plane_start >= AOM_PLANE_Y);
  assert(plane_end <= MAX_MB_PLANE);

  for (int plane = plane_start; plane < plane_end; ++plane) {
    if (plane == 0) {
      // Both plane-0 levels are zero: stop, leaving the remaining planes
      // untouched as well.
      if (!base_lvl[0][0] && !base_lvl[1][0]) break;
    } else if ((plane == 1 || plane == 2) && !base_lvl[0][plane]) {
      // This plane is not filtered.
      continue;
    }

    for (int seg_id = 0; seg_id < MAX_SEGMENTS; ++seg_id) {
      for (int dir = 0; dir < 2; ++dir) {
        const int feature = seg_lvl_lf_lut[plane][dir];
        int level = base_lvl[dir][plane];

        // Apply the segment-level adjustment when the feature is active.
        if (segfeature_active(seg, seg_id, feature)) {
          level = clamp(level + get_segdata(&cm->seg, seg_id, feature), 0,
                        MAX_LOOP_FILTER);
        }

        if (lf->mode_ref_delta_enabled) {
          // The deltas are multiplied by 1 for levels 0..31 and by 2 for
          // levels 32..63.
          const int scale = 1 << (level >> 5);

          // Intra blocks only use the reference delta (mode index 0).
          lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] = clamp(
              level + lf->ref_deltas[INTRA_FRAME] * scale, 0, MAX_LOOP_FILTER);

          // Inter blocks combine the reference delta and the mode delta.
          for (int ref = LAST_FRAME; ref < REF_FRAMES; ++ref) {
            const int ref_lvl = level + lf->ref_deltas[ref] * scale;
            for (int mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
              lfi->lvl[plane][seg_id][dir][ref][mode] = clamp(
                  ref_lvl + lf->mode_deltas[mode] * scale, 0, MAX_LOOP_FILTER);
            }
          }
        } else {
          // We could get rid of this if we assume that deltas are set to
          // zero when not in use; the encoder always uses deltas.
          memset(lfi->lvl[plane][seg_id][dir], level,
                 sizeof(lfi->lvl[plane][seg_id][dir]));
        }
      }
    }
  }
}
583 
584 #if LOOP_FILTER_BITMASK
585 // A 64x64 tx block requires 256 bits to represent each 4x4 tx block.
586 // Every 4 rows is represented by one uint64_t mask. Hence,
587 // there are 4 uint64_t bitmask[4] to represent the 64x64 block.
588 //
// Given a location (mi_col, mi_row), this function returns the index
// 0, 1, 2, 3 to select which bitmask[] to use, and the shift value.
//
// For example, mi_row is the row offset in units of mi size (4 pixels), so
// (mi_row / 4) selects which uint64_t.
// After locating which uint64_t, mi_row % 4 is the
// row offset, and each row has 16 = 1 << stride_log2 4x4 units.
// Therefore, shift = (row << stride_log2) + mi_col;
int get_index_shift(int mi_col, int mi_row, int *index) {
  // Four mi rows are packed into each 64-bit word, so the word is selected by
  // mi_row >> 2 and the row inside that word by the low two bits of mi_row.
  // Every row takes 1 << stride_log2 == 16 bits; mi_col picks the bit within
  // the row.
  const int stride_log2 = 4;
  const int row_in_word = mi_row & 3;
  *index = mi_row >> 2;
  return (row_in_word << stride_log2) | mi_col;
}
605 
check_mask(const FilterMask * lfm)606 static void check_mask(const FilterMask *lfm) {
607 #ifndef NDEBUG
608   for (int i = 0; i < 4; ++i) {
609     assert(!(lfm[TX_4X4].bits[i] & lfm[TX_8X8].bits[i]));
610     assert(!(lfm[TX_4X4].bits[i] & lfm[TX_16X16].bits[i]));
611     assert(!(lfm[TX_4X4].bits[i] & lfm[TX_32X32].bits[i]));
612     assert(!(lfm[TX_4X4].bits[i] & lfm[TX_64X64].bits[i]));
613     assert(!(lfm[TX_8X8].bits[i] & lfm[TX_16X16].bits[i]));
614     assert(!(lfm[TX_8X8].bits[i] & lfm[TX_32X32].bits[i]));
615     assert(!(lfm[TX_8X8].bits[i] & lfm[TX_64X64].bits[i]));
616     assert(!(lfm[TX_16X16].bits[i] & lfm[TX_32X32].bits[i]));
617     assert(!(lfm[TX_16X16].bits[i] & lfm[TX_64X64].bits[i]));
618     assert(!(lfm[TX_32X32].bits[i] & lfm[TX_64X64].bits[i]));
619   }
620 #else
621   (void)lfm;
622 #endif
623 }
624 
check_loop_filter_masks(const LoopFilterMask * lfm,int plane)625 static void check_loop_filter_masks(const LoopFilterMask *lfm, int plane) {
626   if (plane == 0) {
627     // Assert if we try to apply 2 different loop filters at the same
628     // position.
629     check_mask(lfm->left_y);
630     check_mask(lfm->above_y);
631   } else if (plane == 1) {
632     check_mask(lfm->left_u);
633     check_mask(lfm->above_u);
634   } else {
635     check_mask(lfm->left_v);
636     check_mask(lfm->above_v);
637   }
638 }
639 
// ORs the four 64-bit words of |mask| into the mask stored in |lfm| for the
// given edge direction, plane and square transform size.
//   dir          VERT_EDGE updates the left_* masks, anything else the
//                above_* masks.
//   plane        0, 1 or 2 (Y, U, V). Other values leave |lfm| untouched; a
//                value greater than 2 additionally trips an assert.
//   mask         four words to merge in.
//   sqr_tx_size  index of the per-size mask to update.
static void update_masks(EDGE_DIR dir, int plane, uint64_t *mask,
                         TX_SIZE sqr_tx_size, LoopFilterMask *lfm) {
  const int is_vert = (dir == VERT_EDGE);
  FilterMask *plane_masks;

  switch (plane) {
    case 0: plane_masks = is_vert ? lfm->left_y : lfm->above_y; break;
    case 1: plane_masks = is_vert ? lfm->left_u : lfm->above_u; break;
    case 2: plane_masks = is_vert ? lfm->left_v : lfm->above_v; break;
    default: assert(plane <= 2); return;
  }

  FilterMask *const dst = &plane_masks[sqr_tx_size];
  for (int i = 0; i < 4; ++i) dst->bits[i] |= mask[i];
}
673 
// Returns 1 when the edge at (mi_row, mi_col) must not be filtered because it
// lies outside the frame or on the frame's first column (vertical edges) /
// first row (horizontal edges); returns 0 otherwise.
//
// The "outside the frame" test depends on the plane layout:
//   - luma, or chroma without subsampling: row >= height or col >= width
//   - chroma with ssx and ssy (420):       row >  height or col >  width
//   - chroma with ssx only (422):          row >= height or col >  width
//   - chroma with ssy only:                no such test is performed
// where row/col are the mi coordinates scaled by MI_SIZE_LOG2.
static int is_frame_boundary(AV1_COMMON *const cm, int plane, int mi_row,
                             int mi_col, int ssx, int ssy, EDGE_DIR dir) {
  const int px_row = mi_row << MI_SIZE_LOG2;
  const int px_col = mi_col << MI_SIZE_LOG2;

  if (plane && (ssx || ssy)) {
    if (ssx && ssy) {  // format 420
      if (px_row > cm->height || px_col > cm->width) return 1;
    } else if (ssx) {  // format 422
      if (px_row >= cm->height || px_col > cm->width) return 1;
    }
  } else if (px_row >= cm->height || px_col >= cm->width) {
    return 1;
  }

  const int is_vert = (dir == VERT_EDGE);
  const int pos = is_vert ? mi_col : mi_row;
  const int subsampled = is_vert ? ssx : ssy;

  // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block.
  // So if mi_col == 1, it is actually the frame boundary: drop the low bit
  // before comparing against zero.
  if (plane != 0 && subsampled) return (pos & 0x0FFFFFFE) == 0;
  return pos == 0;
}
706 
// Fills in the loop filter information for one transform block located at
// (mi_row, mi_col), for both the vertical and the horizontal edge direction:
//   1. stores the block's filter level in the lfl_* arrays of the
//      LoopFilterMask returned by get_loop_filter_mask(), and
//   2. for every 4x4 position along the block's left (vertical) or top
//      (horizontal) edge that needs filtering, sets the matching bit in the
//      left_*/above_* mask of the selected transform size.
static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
                        int ssx, int ssy, TX_SIZE tx_size) {
  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
  const int x = (mi_col << (MI_SIZE_LOG2 - ssx));
  const int y = (mi_row << (MI_SIZE_LOG2 - ssy));

  // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block.
  // The OR is idempotent, so applying it once serves both edge directions.
  mi_row |= ssy;
  mi_col |= ssx;

  // decide whether current vertical/horizontal edge needs loop filtering
  for (EDGE_DIR dir = VERT_EDGE; dir <= HORZ_EDGE; ++dir) {
    const int is_vert = (dir == VERT_EDGE);

    const MB_MODE_INFO *const mbmi =
        cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
    const int curr_skip = mbmi->skip && is_inter_block(mbmi);
    const BLOCK_SIZE bsizec = scale_chroma_bsize(mbmi->sb_type, ssx, ssy);
    const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];
    const uint8_t level = get_filter_level(cm, &cm->lf_info, dir, plane, mbmi);

    // The edge is a coding block border when the plane-space coordinate is a
    // multiple of the block dimension measured across the edge.
    const int prediction_masks =
        (is_vert ? block_size_wide[plane_bsize] : block_size_high[plane_bsize]) -
        1;
    const int is_coding_block_border = !((is_vert ? x : y) & prediction_masks);

    // TODO(chengchen): step can be optimized.
    const int row_step = mi_size_high[TX_4X4] << ssy;
    const int col_step = mi_size_wide[TX_4X4] << ssx;
    // Walk down the left edge for vertical edges, along the top edge for
    // horizontal ones.
    const int mi_height =
        is_vert ? (tx_size_high_unit[tx_size] << ssy) : row_step;
    const int mi_width =
        is_vert ? col_step : (tx_size_wide_unit[tx_size] << ssx);
    const int row_limit = mi_row + mi_height;
    const int col_limit = mi_col + mi_width;

    // assign filter levels
    for (int r = mi_row; r < row_limit; r += row_step) {
      for (int c = mi_col; c < col_limit; c += col_step) {
        // do not filter frame boundary
        // Note: when chroma planes' size are half of luma plane,
        // chroma plane mi corresponds to even position.
        // If frame size is not even, we still need to filter this chroma
        // position. Therefore the boundary condition check needs to be
        // separated to two cases.
        const int px_r = r << MI_SIZE_LOG2;
        const int px_c = c << MI_SIZE_LOG2;
        int outside_frame = 0;
        if (plane && (ssx || ssy)) {
          if (ssx && ssy) {  // format 420
            outside_frame = px_r > cm->height || px_c > cm->width;
          } else if (ssx) {  // format 422
            outside_frame = px_r >= cm->height || px_c > cm->width;
          }
        } else {
          outside_frame = px_r >= cm->height || px_c >= cm->width;
        }
        if (outside_frame) continue;

        const int row = r % MI_SIZE_64X64;
        const int col = c % MI_SIZE_64X64;
        switch (plane) {
          case 0:
            if (is_vert)
              lfm->lfl_y_ver[row][col] = level;
            else
              lfm->lfl_y_hor[row][col] = level;
            break;
          case 1:
            lfm->lfl_u_ver[row][col] = level;
            lfm->lfl_u_hor[row][col] = level;
            break;
          default:
            lfm->lfl_v_ver[row][col] = level;
            lfm->lfl_v_hor[row][col] = level;
            break;
        }
      }
    }

    // set the edge bits
    for (int r = mi_row; r < row_limit; r += row_step) {
      for (int c = mi_col; c < col_limit; c += col_step) {
        // do not filter frame boundary
        if (is_frame_boundary(cm, plane, r, c, ssx, ssy, dir)) continue;

        // Neighbour on the other side of the edge: left for vertical edges,
        // above for horizontal edges.
        const int prev_row = is_vert ? r : r - (1 << ssy);
        const int prev_col = is_vert ? c - (1 << ssx) : c;
        const MB_MODE_INFO *const mbmi_prev =
            cm->mi_grid_visible[prev_row * cm->mi_stride + prev_col];
        const int prev_skip = mbmi_prev->skip && is_inter_block(mbmi_prev);
        const uint8_t level_prev =
            get_filter_level(cm, &cm->lf_info, dir, plane, mbmi_prev);

        // No filtering when both sides have level 0, or when both sides are
        // skipped inter blocks and the edge is interior to a coding block.
        if (!(level || level_prev)) continue;
        if (curr_skip && prev_skip && !is_coding_block_border) continue;

        const TX_SIZE prev_tx_size =
            plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy)
                  : mbmi_prev->tx_size;
        TX_SIZE min_tx_size;
        if (is_vert) {
          min_tx_size =
              AOMMIN(txsize_horz_map[tx_size], txsize_horz_map[prev_tx_size]);
        } else {
          min_tx_size =
              AOMMIN(txsize_vert_map[tx_size], txsize_vert_map[prev_tx_size]);
        }
        min_tx_size = AOMMIN(min_tx_size, TX_16X16);
        assert(min_tx_size < TX_SIZES);

        int index = 0;
        const int shift =
            get_index_shift(c % MI_SIZE_64X64, r % MI_SIZE_64X64, &index);
        assert(index < 4 && index >= 0);

        // set mask on corresponding bit
        uint64_t mask[4] = { 0 };
        mask[index] = (uint64_t)1 << shift;
        update_masks(dir, plane, mask, min_tx_size, lfm);
      }
    }
  }
}
823 
// Sets up the loop filter masks for the transform block at offset
// (blk_row, blk_col) inside the coding block whose top-left mi position is
// (mi_row, mi_col).
//
// Chroma planes call setup_masks() directly with |tx_size|. For luma, the
// actual transform size is looked up from the mode info; when it differs from
// |tx_size| the block is split according to sub_tx_size_map and each
// sub-block is handled recursively.
static void setup_tx_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
                                int blk_row, int blk_col,
                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
                                int plane, int ssx, int ssy) {
  blk_row <<= ssy;
  blk_col <<= ssx;

  const int abs_row = mi_row + blk_row;
  const int abs_col = mi_col + blk_col;
  if ((abs_row << MI_SIZE_LOG2) >= cm->height ||
      (abs_col << MI_SIZE_LOG2) >= cm->width)
    return;

  // U/V plane, tx_size is always the largest size
  if (plane) {
    assert(tx_size_wide[tx_size] <= 32 && tx_size_high[tx_size] <= 32);
    setup_masks(cm, abs_row, abs_col, plane, ssx, ssy, tx_size);
    return;
  }

  // From here on only the Y plane is handled:
  // If intra block, tx size is univariant.
  // If inter block, tx size follows inter_tx_size.
  const MB_MODE_INFO *const mbmi =
      cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
  TX_SIZE plane_tx_size;
  if (!is_inter_block(mbmi)) {
    const MB_MODE_INFO *const mbmi_this =
        cm->mi_grid_visible[abs_row * cm->mi_stride + abs_col];
    plane_tx_size = mbmi_this->tx_size;
  } else if (mbmi->skip) {
    // TODO(chengchen): change av1_get_transform_size() to be consistent.
    // plane_tx_size = get_max_rect_tx_size(plane_bsize);
    plane_tx_size = mbmi->tx_size;
  } else {
    plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
        plane_bsize, blk_row, blk_col)];
  }

  assert(txsize_to_bsize[plane_tx_size] <= plane_bsize);

  if (plane_tx_size == tx_size) {
    setup_masks(cm, abs_row, abs_col, plane, ssx, ssy, tx_size);
    return;
  }

  // The coded transform is smaller than |tx_size|: descend one split level.
  const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
  const int sub_w = tx_size_wide_unit[sub_txs];
  const int sub_h = tx_size_high_unit[sub_txs];
  for (int row = 0; row < tx_size_high_unit[tx_size]; row += sub_h) {
    for (int col = 0; col < tx_size_wide_unit[tx_size]; col += sub_w) {
      setup_tx_block_mask(cm, mi_row, mi_col, blk_row + row, blk_col + col,
                          plane_bsize, sub_txs, plane, ssx, ssy);
    }
  }
}
888 
// Sets up the loop filter masks of |plane| for the single coding block whose
// top-left mi position is (mi_row, mi_col).
//
// The block is walked in units of at most 64x64 luma pixels (scaled by the
// plane's subsampling), and inside every unit in steps of the largest
// transform size allowed for the block; each step is handed to
// setup_tx_block_mask().
static void setup_fix_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
                                 int plane, int ssx, int ssy) {
  // With subsampling, the mode info is read at the odd (bottom/right) mi
  // position via the "| ssy" / "| ssx".
  MB_MODE_INFO **mi =
      cm->mi_grid_visible + (mi_row | ssy) * cm->mi_stride + (mi_col | ssx);
  const MB_MODE_INFO *const mbmi = mi[0];

  const BLOCK_SIZE bsize = mbmi->sb_type;
  const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy);
  const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];

  const int block_width = mi_size_wide[plane_bsize];
  const int block_height = mi_size_high[plane_bsize];

  TX_SIZE max_txsize = max_txsize_rect_lookup[plane_bsize];
  // The decoder is designed so that it can process 64x64 luma pixels at a
  // time. If this is a chroma plane with subsampling and bsize corresponds to
  // a subsampled BLOCK_128X128 then the lookup above will give TX_64X64. That
  // mustn't be used for the subsampled plane (because it would be bigger than
  // a 64x64 luma block) so we round down to TX_32X32.
  if (plane && txsize_sqr_up_map[max_txsize] == TX_64X64) {
    if (max_txsize == TX_16X64)
      max_txsize = TX_16X32;
    else if (max_txsize == TX_64X16)
      max_txsize = TX_32X16;
    else
      max_txsize = TX_32X32;
  }

  // Step sizes (in 4x4 units) of one maximum-size transform block. Widths are
  // scaled with tx_size_wide_log2[0] and heights with tx_size_high_log2[0],
  // matching the mu_blocks_* computation below.
  const BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize];
  const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
  const int bh = block_size_high[txb_size] >> tx_size_high_log2[0];

  // Size (in 4x4 units) of the largest unit processed at once.
  const BLOCK_SIZE max_unit_bsize = ss_size_lookup[BLOCK_64X64][ssx][ssy];
  int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
  int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];

  mu_blocks_wide = AOMMIN(block_width, mu_blocks_wide);
  mu_blocks_high = AOMMIN(block_height, mu_blocks_high);

  // Y: Largest tx_size is 64x64, while superblock size can be 128x128.
  // Here we ensure that setup_tx_block_mask process at most a 64x64 block.
  // U/V: largest tx size is 32x32.
  for (int idy = 0; idy < block_height; idy += mu_blocks_high) {
    for (int idx = 0; idx < block_width; idx += mu_blocks_wide) {
      const int unit_height = AOMMIN(mu_blocks_high + idy, block_height);
      const int unit_width = AOMMIN(mu_blocks_wide + idx, block_width);
      for (int blk_row = idy; blk_row < unit_height; blk_row += bh) {
        for (int blk_col = idx; blk_col < unit_width; blk_col += bw) {
          setup_tx_block_mask(cm, mi_row, mi_col, blk_row, blk_col, plane_bsize,
                              max_txsize, plane, ssx, ssy);
        }
      }
    }
  }
}
943 
// Recursively walks the partition tree rooted at the |bsize| block at
// (mi_row, mi_col) and calls setup_fix_block_mask() once for every coding
// block of |plane| that starts inside the frame.
//
// Sub-blocks at (mi_row + hbs) / (mi_col + hbs) are only visited when that
// position is still inside the frame and, for subsampled planes, when |bsize|
// is larger than BLOCK_8X8.
static void setup_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
                             BLOCK_SIZE bsize, int plane, int ssx, int ssy) {
  if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
      (mi_col << MI_SIZE_LOG2) >= cm->width)
    return;

  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
  const int hbs = mi_size_wide[bsize] / 2;
  const int quarter_step = mi_size_wide[bsize] / 4;
  const int allow_sub8x8 = (ssx || ssy) ? bsize > BLOCK_8X8 : 1;
  const int has_next_row =
      (((mi_row + hbs) << MI_SIZE_LOG2) < cm->height) & allow_sub8x8;
  const int has_next_col =
      (((mi_col + hbs) << MI_SIZE_LOG2) < cm->width) & allow_sub8x8;
  // The bottom-right quadrant exists only when both halves are inside.
  const int has_next_both = has_next_col & has_next_row;
  int i;

  switch (partition) {
    case PARTITION_NONE:
      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
      break;
    case PARTITION_HORZ:
      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
      if (has_next_row)
        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
      break;
    case PARTITION_VERT:
      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
      if (has_next_col)
        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
      break;
    case PARTITION_SPLIT:
      setup_block_mask(cm, mi_row, mi_col, subsize, plane, ssx, ssy);
      if (has_next_col)
        setup_block_mask(cm, mi_row, mi_col + hbs, subsize, plane, ssx, ssy);
      if (has_next_row)
        setup_block_mask(cm, mi_row + hbs, mi_col, subsize, plane, ssx, ssy);
      if (has_next_both)
        setup_block_mask(cm, mi_row + hbs, mi_col + hbs, subsize, plane, ssx,
                         ssy);
      break;
    case PARTITION_HORZ_A:
      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
      if (has_next_col)
        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
      if (has_next_row)
        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
      break;
    case PARTITION_HORZ_B:
      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
      if (has_next_row)
        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
      if (has_next_both)
        setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy);
      break;
    case PARTITION_VERT_A:
      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
      if (has_next_row)
        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
      if (has_next_col)
        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
      break;
    case PARTITION_VERT_B:
      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
      if (has_next_col)
        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
      // The third block sits at (mi_row + hbs, mi_col + hbs), so it needs
      // both the next row and the next column to be inside the frame, exactly
      // like the bottom-right block of PARTITION_HORZ_B.
      if (has_next_both)
        setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy);
      break;
    case PARTITION_HORZ_4:
      // NOTE(review): the row bound here is tested in pixels against
      // cm->height while PARTITION_VERT_4 tests mi units against cm->mi_cols;
      // confirm the asymmetry is intended.
      for (i = 0; i < 4; ++i) {
        int this_mi_row = mi_row + i * quarter_step;
        if (i > 0 && (this_mi_row << MI_SIZE_LOG2) >= cm->height) break;
        // chroma plane filter the odd location
        if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue;

        setup_fix_block_mask(cm, this_mi_row, mi_col, plane, ssx, ssy);
      }
      break;
    case PARTITION_VERT_4:
      for (i = 0; i < 4; ++i) {
        int this_mi_col = mi_col + i * quarter_step;
        if (i > 0 && this_mi_col >= cm->mi_cols) break;
        // chroma plane filter the odd location
        if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue;

        setup_fix_block_mask(cm, mi_row, this_mi_col, plane, ssx, ssy);
      }
      break;
    default: assert(0);
  }
}
1036 
1037 // TODO(chengchen): if lossless, do not need to setup mask. But when
1038 // segments enabled, each segment has different lossless settings.
av1_setup_bitmask(AV1_COMMON * const cm,int mi_row,int mi_col,int plane,int subsampling_x,int subsampling_y,int row_end,int col_end)1039 void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
1040                        int subsampling_x, int subsampling_y, int row_end,
1041                        int col_end) {
1042   const int num_64x64 = cm->seq_params.mib_size >> MIN_MIB_SIZE_LOG2;
1043   for (int y = 0; y < num_64x64; ++y) {
1044     for (int x = 0; x < num_64x64; ++x) {
1045       const int row = mi_row + y * MI_SIZE_64X64;
1046       const int col = mi_col + x * MI_SIZE_64X64;
1047       if (row >= row_end || col >= col_end) continue;
1048       if ((row << MI_SIZE_LOG2) >= cm->height ||
1049           (col << MI_SIZE_LOG2) >= cm->width)
1050         continue;
1051 
1052       LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col);
1053       if (lfm == NULL) return;
1054 
1055       // init mask to zero
1056       if (plane == 0) {
1057         av1_zero(lfm->left_y);
1058         av1_zero(lfm->above_y);
1059         av1_zero(lfm->lfl_y_ver);
1060         av1_zero(lfm->lfl_y_hor);
1061       } else if (plane == 1) {
1062         av1_zero(lfm->left_u);
1063         av1_zero(lfm->above_u);
1064         av1_zero(lfm->lfl_u_ver);
1065         av1_zero(lfm->lfl_u_hor);
1066       } else {
1067         av1_zero(lfm->left_v);
1068         av1_zero(lfm->above_v);
1069         av1_zero(lfm->lfl_v_ver);
1070         av1_zero(lfm->lfl_v_hor);
1071       }
1072     }
1073   }
1074 
1075   // set up bitmask for each superblock
1076   setup_block_mask(cm, mi_row, mi_col, cm->seq_params.sb_size, plane,
1077                    subsampling_x, subsampling_y);
1078 
1079   for (int y = 0; y < num_64x64; ++y) {
1080     for (int x = 0; x < num_64x64; ++x) {
1081       const int row = mi_row + y * MI_SIZE_64X64;
1082       const int col = mi_col + x * MI_SIZE_64X64;
1083       if (row >= row_end || col >= col_end) continue;
1084       if ((row << MI_SIZE_LOG2) >= cm->height ||
1085           (col << MI_SIZE_LOG2) >= cm->width)
1086         continue;
1087 
1088       LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col);
1089       if (lfm == NULL) return;
1090 
1091       // check if the mask is valid
1092       check_loop_filter_masks(lfm, plane);
1093 
1094       {
1095         // Let 16x16 hold 32x32 (Y/U/V) and 64x64(Y only).
1096         // Even tx size is greater, we only apply max length filter, which
1097         // is 16.
1098         if (plane == 0) {
1099           for (int j = 0; j < 4; ++j) {
1100             lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_32X32].bits[j];
1101             lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_64X64].bits[j];
1102             lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_32X32].bits[j];
1103             lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_64X64].bits[j];
1104 
1105             // set 32x32 and 64x64 to 0
1106             lfm->left_y[TX_32X32].bits[j] = 0;
1107             lfm->left_y[TX_64X64].bits[j] = 0;
1108             lfm->above_y[TX_32X32].bits[j] = 0;
1109             lfm->above_y[TX_64X64].bits[j] = 0;
1110           }
1111         } else if (plane == 1) {
1112           for (int j = 0; j < 4; ++j) {
1113             lfm->left_u[TX_16X16].bits[j] |= lfm->left_u[TX_32X32].bits[j];
1114             lfm->above_u[TX_16X16].bits[j] |= lfm->above_u[TX_32X32].bits[j];
1115 
1116             // set 32x32 to 0
1117             lfm->left_u[TX_32X32].bits[j] = 0;
1118             lfm->above_u[TX_32X32].bits[j] = 0;
1119           }
1120         } else {
1121           for (int j = 0; j < 4; ++j) {
1122             lfm->left_v[TX_16X16].bits[j] |= lfm->left_v[TX_32X32].bits[j];
1123             lfm->above_v[TX_16X16].bits[j] |= lfm->above_v[TX_32X32].bits[j];
1124 
1125             // set 32x32 to 0
1126             lfm->left_v[TX_32X32].bits[j] = 0;
1127             lfm->above_v[TX_32X32].bits[j] = 0;
1128           }
1129         }
1130       }
1131 
1132       // check if the mask is valid
1133       check_loop_filter_masks(lfm, plane);
1134     }
1135   }
1136 }
1137 
filter_selectively_vert_row2(int subsampling_factor,uint8_t * s,int pitch,int plane,uint64_t mask_16x16_0,uint64_t mask_8x8_0,uint64_t mask_4x4_0,uint64_t mask_16x16_1,uint64_t mask_8x8_1,uint64_t mask_4x4_1,const loop_filter_info_n * lfi_n,uint8_t * lfl,uint8_t * lfl2)1138 static void filter_selectively_vert_row2(
1139     int subsampling_factor, uint8_t *s, int pitch, int plane,
1140     uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
1141     uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
1142     const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) {
1143   uint64_t mask;
1144   const int step = 1 << subsampling_factor;
1145 
1146   for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
1147               mask_8x8_1 | mask_4x4_1;
1148        mask; mask >>= step) {
1149     const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
1150     const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
1151 
1152     if (mask & 1) {
1153       if ((mask_16x16_0 | mask_16x16_1) & 1) {
1154         // chroma plane filters less pixels introduced in deblock_13tap
1155         // experiment
1156         LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14;
1157 
1158         if ((mask_16x16_0 & mask_16x16_1) & 1) {
1159           if (plane) {
1160             aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
1161                                     lfi0->hev_thr, lfi1->mblim, lfi1->lim,
1162                                     lfi1->hev_thr);
1163           } else {
1164             aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
1165                                      lfi0->hev_thr, lfi1->mblim, lfi1->lim,
1166                                      lfi1->hev_thr);
1167           }
1168         } else if (mask_16x16_0 & 1) {
1169           lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
1170         } else {
1171           lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
1172                        lfi1->hev_thr);
1173         }
1174       }
1175 
1176       if ((mask_8x8_0 | mask_8x8_1) & 1) {
1177         // chroma plane filters less pixels introduced in deblock_13tap
1178         // experiment
1179         LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8;
1180 
1181         if ((mask_8x8_0 & mask_8x8_1) & 1) {
1182           if (plane) {
1183             aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
1184                                     lfi0->hev_thr, lfi1->mblim, lfi1->lim,
1185                                     lfi1->hev_thr);
1186           } else {
1187             aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
1188                                     lfi0->hev_thr, lfi1->mblim, lfi1->lim,
1189                                     lfi1->hev_thr);
1190           }
1191         } else if (mask_8x8_0 & 1) {
1192           lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
1193         } else {
1194           lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
1195                        lfi1->hev_thr);
1196         }
1197       }
1198 
1199       if ((mask_4x4_0 | mask_4x4_1) & 1) {
1200         if ((mask_4x4_0 & mask_4x4_1) & 1) {
1201           aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
1202                                   lfi0->hev_thr, lfi1->mblim, lfi1->lim,
1203                                   lfi1->hev_thr);
1204         } else if (mask_4x4_0 & 1) {
1205           aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
1206         } else {
1207           aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
1208                              lfi1->hev_thr);
1209         }
1210       }
1211     }
1212 
1213     s += 4;
1214     lfl += step;
1215     lfl2 += step;
1216     mask_16x16_0 >>= step;
1217     mask_8x8_0 >>= step;
1218     mask_4x4_0 >>= step;
1219     mask_16x16_1 >>= step;
1220     mask_8x8_1 >>= step;
1221     mask_4x4_1 >>= step;
1222   }
1223 }
1224 
highbd_filter_selectively_vert_row2(int subsampling_factor,uint16_t * s,int pitch,int plane,uint64_t mask_16x16_0,uint64_t mask_8x8_0,uint64_t mask_4x4_0,uint64_t mask_16x16_1,uint64_t mask_8x8_1,uint64_t mask_4x4_1,const loop_filter_info_n * lfi_n,uint8_t * lfl,uint8_t * lfl2,int bd)1225 static void highbd_filter_selectively_vert_row2(
1226     int subsampling_factor, uint16_t *s, int pitch, int plane,
1227     uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
1228     uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
1229     const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) {
1230   uint64_t mask;
1231   const int step = 1 << subsampling_factor;
1232 
1233   for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
1234               mask_8x8_1 | mask_4x4_1;
1235        mask; mask >>= step) {
1236     const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
1237     const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2;
1238 
1239     if (mask & 1) {
1240       if ((mask_16x16_0 | mask_16x16_1) & 1) {
1241         // chroma plane filters less pixels introduced in deblock_13tap
1242         // experiment
1243         HbdLpfFunc highbd_lpf_vertical =
1244             plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14;
1245 
1246         if ((mask_16x16_0 & mask_16x16_1) & 1) {
1247           if (plane) {
1248             aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
1249                                            lfi0->hev_thr, lfi1->mblim,
1250                                            lfi1->lim, lfi1->hev_thr, bd);
1251           } else {
1252             aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
1253                                             lfi0->hev_thr, lfi1->mblim,
1254                                             lfi1->lim, lfi1->hev_thr, bd);
1255           }
1256         } else if (mask_16x16_0 & 1) {
1257           highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
1258                               bd);
1259         } else {
1260           highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
1261                               lfi1->hev_thr, bd);
1262         }
1263       }
1264 
1265       if ((mask_8x8_0 | mask_8x8_1) & 1) {
1266         HbdLpfFunc highbd_lpf_vertical =
1267             plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8;
1268 
1269         if ((mask_8x8_0 & mask_8x8_1) & 1) {
1270           if (plane) {
1271             aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
1272                                            lfi0->hev_thr, lfi1->mblim,
1273                                            lfi1->lim, lfi1->hev_thr, bd);
1274           } else {
1275             aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
1276                                            lfi0->hev_thr, lfi1->mblim,
1277                                            lfi1->lim, lfi1->hev_thr, bd);
1278           }
1279         } else if (mask_8x8_0 & 1) {
1280           highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
1281                               bd);
1282         } else {
1283           highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
1284                               lfi1->hev_thr, bd);
1285         }
1286       }
1287 
1288       if ((mask_4x4_0 | mask_4x4_1) & 1) {
1289         if ((mask_4x4_0 & mask_4x4_1) & 1) {
1290           aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
1291                                          lfi0->hev_thr, lfi1->mblim, lfi1->lim,
1292                                          lfi1->hev_thr, bd);
1293         } else if (mask_4x4_0 & 1) {
1294           aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
1295                                     lfi0->hev_thr, bd);
1296         } else {
1297           aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim,
1298                                     lfi1->lim, lfi1->hev_thr, bd);
1299         }
1300       }
1301     }
1302 
1303     s += 4;
1304     lfl += step;
1305     lfl2 += step;
1306     mask_16x16_0 >>= step;
1307     mask_8x8_0 >>= step;
1308     mask_4x4_0 >>= step;
1309     mask_16x16_1 >>= step;
1310     mask_8x8_1 >>= step;
1311     mask_4x4_1 >>= step;
1312   }
1313 }
1314 
filter_selectively_horiz(uint8_t * s,int pitch,int plane,int subsampling,uint64_t mask_16x16,uint64_t mask_8x8,uint64_t mask_4x4,const loop_filter_info_n * lfi_n,const uint8_t * lfl)1315 static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
1316                                      int subsampling, uint64_t mask_16x16,
1317                                      uint64_t mask_8x8, uint64_t mask_4x4,
1318                                      const loop_filter_info_n *lfi_n,
1319                                      const uint8_t *lfl) {
1320   uint64_t mask;
1321   int count;
1322   const int step = 1 << subsampling;
1323   const unsigned int two_block_mask = subsampling ? 5 : 3;
1324   int offset = 0;
1325 
1326   for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
1327     const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
1328     // Next block's thresholds, when it is within current 64x64 block.
1329     // If it is out of bound, its mask is zero, and it points to current edge's
1330     // filter parameters, instead of next edge's.
1331     int next_edge = step;
1332     if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0;
1333     const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge);
1334 
1335     count = 1;
1336     if (mask & 1) {
1337       if (mask_16x16 & 1) {
1338         // chroma plane filters less pixels introduced in deblock_13tap
1339         // experiment
1340         LpfFunc lpf_horizontal =
1341             plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
1342 
1343         if ((mask_16x16 & two_block_mask) == two_block_mask) {
1344           if (plane) {
1345             aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
1346                                       lfi->hev_thr, lfin->mblim, lfin->lim,
1347                                       lfin->hev_thr);
1348           } else {
1349             aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
1350                                        lfi->hev_thr, lfin->mblim, lfin->lim,
1351                                        lfin->hev_thr);
1352           }
1353           count = 2;
1354         } else {
1355           lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
1356         }
1357       } else if (mask_8x8 & 1) {
1358         // chroma plane filters less pixels introduced in deblock_13tap
1359         // experiment
1360         LpfFunc lpf_horizontal =
1361             plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
1362 
1363         if ((mask_8x8 & two_block_mask) == two_block_mask) {
1364           if (plane) {
1365             aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
1366                                       lfi->hev_thr, lfin->mblim, lfin->lim,
1367                                       lfin->hev_thr);
1368           } else {
1369             aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
1370                                       lfi->hev_thr, lfin->mblim, lfin->lim,
1371                                       lfin->hev_thr);
1372           }
1373           count = 2;
1374         } else {
1375           lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
1376         }
1377       } else if (mask_4x4 & 1) {
1378         if ((mask_4x4 & two_block_mask) == two_block_mask) {
1379           aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
1380                                     lfi->hev_thr, lfin->mblim, lfin->lim,
1381                                     lfin->hev_thr);
1382           count = 2;
1383         } else {
1384           aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
1385         }
1386       }
1387     }
1388 
1389     s += 4 * count;
1390     lfl += step * count;
1391     mask_16x16 >>= step * count;
1392     mask_8x8 >>= step * count;
1393     mask_4x4 >>= step * count;
1394     offset += step * count;
1395   }
1396 }
1397 
highbd_filter_selectively_horiz(uint16_t * s,int pitch,int plane,int subsampling,uint64_t mask_16x16,uint64_t mask_8x8,uint64_t mask_4x4,const loop_filter_info_n * lfi_n,uint8_t * lfl,int bd)1398 static void highbd_filter_selectively_horiz(
1399     uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16,
1400     uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n,
1401     uint8_t *lfl, int bd) {
1402   uint64_t mask;
1403   int count;
1404   const int step = 1 << subsampling;
1405   const unsigned int two_block_mask = subsampling ? 5 : 3;
1406   int offset = 0;
1407 
1408   for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
1409     const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
1410     // Next block's thresholds, when it is within current 64x64 block.
1411     // If it is out of bound, its mask is zero, and it points to current edge's
1412     // filter parameters, instead of next edge's.
1413     int next_edge = step;
1414     if (offset + next_edge >= MI_SIZE_64X64) next_edge = 0;
1415     const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + next_edge);
1416 
1417     count = 1;
1418     if (mask & 1) {
1419       if (mask_16x16 & 1) {
1420         HbdLpfFunc highbd_lpf_horizontal =
1421             plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
1422 
1423         if ((mask_16x16 & two_block_mask) == two_block_mask) {
1424           if (plane) {
1425             aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim,
1426                                                lfi->hev_thr, lfin->mblim,
1427                                                lfin->lim, lfin->hev_thr, bd);
1428           } else {
1429             aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
1430                                               lfi->hev_thr, lfin->mblim,
1431                                               lfin->lim, lfin->hev_thr, bd);
1432           }
1433           count = 2;
1434         } else {
1435           highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
1436                                 bd);
1437         }
1438       } else if (mask_8x8 & 1) {
1439         HbdLpfFunc highbd_lpf_horizontal =
1440             plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
1441 
1442         if ((mask_8x8 & two_block_mask) == two_block_mask) {
1443           if (plane) {
1444             aom_highbd_lpf_horizontal_6_dual_c(s, pitch, lfi->mblim, lfi->lim,
1445                                                lfi->hev_thr, lfin->mblim,
1446                                                lfin->lim, lfin->hev_thr, bd);
1447           } else {
1448             aom_highbd_lpf_horizontal_8_dual_c(s, pitch, lfi->mblim, lfi->lim,
1449                                                lfi->hev_thr, lfin->mblim,
1450                                                lfin->lim, lfin->hev_thr, bd);
1451           }
1452           count = 2;
1453         } else {
1454           highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
1455                                 bd);
1456         }
1457       } else if (mask_4x4 & 1) {
1458         if ((mask_4x4 & two_block_mask) == two_block_mask) {
1459           aom_highbd_lpf_horizontal_4_dual_c(s, pitch, lfi->mblim, lfi->lim,
1460                                              lfi->hev_thr, lfin->mblim,
1461                                              lfin->lim, lfin->hev_thr, bd);
1462           count = 2;
1463         } else {
1464           aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
1465                                       lfi->hev_thr, bd);
1466         }
1467       }
1468     }
1469 
1470     s += 4 * count;
1471     lfl += step * count;
1472     mask_16x16 >>= step * count;
1473     mask_8x8 >>= step * count;
1474     mask_4x4 >>= step * count;
1475     offset += step * count;
1476   }
1477 }
1478 
// Builds the vertical-edge (left border) loop-filter bitmasks of one plane.
//
// Walks the plane row by row and, inside each row, one transform block at a
// time from left to right. For every transform block it reads the skip flag,
// the vertical coding-block-border flag, the filter level and the transform
// size stored in the LoopFilterMask covering that position, and sets a bit in
// lfm->left_{y,u,v}[] when the edge shared with the previously visited
// transform block has to be filtered.
//
//   cm        - common state, used to look up LoopFilterMask entries.
//   plane_ptr - supplies the plane's subsampling and dst.width / dst.height.
//   plane     - 0, 1 or 2 (the y, u, v tables); any other value asserts and
//               returns.
void av1_build_bitmask_vert_info(
    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
    int plane) {
  const int subsampling_x = plane_ptr->subsampling_x;
  const int subsampling_y = plane_ptr->subsampling_y;
  const int row_step = (MI_SIZE >> MI_SIZE_LOG2);
  const int is_uv = plane > 0;
  // The prev_* values describe the transform block visited just before the
  // current one. They are never reset at the start of a row; the first column
  // of a row cannot produce an edge (see the `c + col_in_unit > 0` test), so
  // stale values from the previous row are only overwritten there.
  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
  uint8_t level, prev_level = 1;
  uint64_t skip, prev_skip = 0;
  uint64_t is_coding_block_border;

  for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r += row_step) {
    // r counts rows of this plane; mi_row is the matching row scaled up by the
    // plane's vertical subsampling.
    const int mi_row = r << subsampling_y;
    const int row = mi_row % MI_SIZE_64X64;
    // Row used for this plane's tables: `row` with the low bit forced on when
    // the plane is vertically subsampled.
    const int row_uv = row | subsampling_y;
    int index = 0;
    const int shift = get_index_shift(0, row, &index);

    // Outer column loop: one LoopFilterMask (TX_64X64-wide unit) per step.
    for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width;
         c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) {
      const int mi_col = c << subsampling_x;
      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);

      // Inner column loop: advances by the width of each transform block (see
      // the end of the body), so there is no increment expression here.
      for (int col_in_unit = 0;
           col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) {
        const int x = (c + col_in_unit) << MI_SIZE_LOG2;
        // Stop at the right edge of the plane.
        if (x >= plane_ptr->dst.width) break;
        const int col = col_in_unit << subsampling_x;
        const int col_uv = col | subsampling_x;
        const uint64_t mask = ((uint64_t)1 << (shift | col));
        skip = lfm->skip.bits[index] & mask;
        is_coding_block_border = lfm->is_vert_border.bits[index] & mask;
        switch (plane) {
          case 0: level = lfm->lfl_y_ver[row_uv][col_uv]; break;
          case 1: level = lfm->lfl_u_ver[row_uv][col_uv]; break;
          case 2: level = lfm->lfl_v_ver[row_uv][col_uv]; break;
          default: assert(plane >= 0 && plane <= 2); return;
        }
        // Take the first transform size, in TX_4X4..TX_64X64 order, whose mask
        // has this position set (TX_64X64 is not checked for chroma). If none
        // is set, tx_size keeps the value from the previous block.
        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
          if (is_uv && ts == TX_64X64) continue;
          if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) {
            tx_size = ts;
            break;
          }
        }
        // An edge is marked when this is not the first column of the plane,
        // at least one side has a non-zero level, and either side is not
        // skipped or the edge is a coding block border.
        if ((c + col_in_unit > 0) && (level || prev_level) &&
            (!prev_skip || !skip || is_coding_block_border)) {
          // The edge is recorded under the smaller of the two transform sizes,
          // clamped to TX_16X16.
          const TX_SIZE min_tx_size =
              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
          // This call also rewrites `index`.
          // NOTE(review): later reads through `index` in this row rely on row
          // and row_uv mapping to the same index - confirm against
          // get_index_shift().
          const int shift_1 = get_index_shift(col_uv, row_uv, &index);
          const uint64_t mask_1 = ((uint64_t)1 << shift_1);
          switch (plane) {
            case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break;
            case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break;
            case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break;
            default: assert(plane >= 0 && plane <= 2); return;
          }
          // The edge is marked although this block's own level is 0: store the
          // previous block's level at this position (presumably so the
          // filtering pass, which reads the level here, finds a non-zero one).
          if (level == 0 && prev_level != 0) {
            switch (plane) {
              case 0: lfm->lfl_y_ver[row_uv][col_uv] = prev_level; break;
              case 1: lfm->lfl_u_ver[row_uv][col_uv] = prev_level; break;
              case 2: lfm->lfl_v_ver[row_uv][col_uv] = prev_level; break;
              default: assert(plane >= 0 && plane <= 2); return;
            }
          }
        }

        // update prev info
        prev_level = level;
        prev_skip = skip;
        prev_tx_size = tx_size;
        // advance by the width of the current transform block
        col_in_unit += tx_size_wide_unit[tx_size];
      }
    }
  }
}
1557 
// Builds the horizontal-edge (above border) loop-filter bitmasks of one plane.
//
// Walks the plane column by column and, inside each column, one transform
// block at a time from top to bottom. For every transform block it reads the
// skip flag, the horizontal coding-block-border flag, the filter level and the
// transform size stored in the LoopFilterMask covering that position, and sets
// a bit in lfm->above_{y,u,v}[] when the edge shared with the previously
// visited transform block has to be filtered.
//
//   cm        - common state, used to look up LoopFilterMask entries.
//   plane_ptr - supplies the plane's subsampling and dst.width / dst.height.
//   plane     - 0, 1 or 2 (the y, u, v tables); any other value asserts and
//               returns.
void av1_build_bitmask_horz_info(
    AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
    int plane) {
  const int subsampling_x = plane_ptr->subsampling_x;
  const int subsampling_y = plane_ptr->subsampling_y;
  const int col_step = (MI_SIZE >> MI_SIZE_LOG2);
  const int is_uv = plane > 0;
  // The prev_* values describe the transform block visited just before the
  // current one. They are never reset at the top of a column; the first row of
  // a column cannot produce an edge (see the `r + r_in_unit > 0` test), so
  // stale values from the previous column are only overwritten there.
  TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
  uint8_t level, prev_level = 1;
  uint64_t skip, prev_skip = 0;
  uint64_t is_coding_block_border;

  for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c += col_step) {
    // c counts columns of this plane; mi_col is the matching column scaled up
    // by the plane's horizontal subsampling.
    const int mi_col = c << subsampling_x;
    const int col = mi_col % MI_SIZE_64X64;
    // Column used for this plane's tables: `col` with the low bit forced on
    // when the plane is horizontally subsampled.
    const int col_uv = col | subsampling_x;

    // Outer row loop: one LoopFilterMask (TX_64X64-high unit) per step.
    for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height;
         r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) {
      const int mi_row = r << subsampling_y;
      LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);

      // Inner row loop: advances by the height of each transform block (see
      // the end of the body), so there is no increment expression here.
      for (int r_in_unit = 0;
           r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) {
        const int y = (r + r_in_unit) << MI_SIZE_LOG2;
        // Stop at the bottom edge of the plane.
        if (y >= plane_ptr->dst.height) break;
        const int row = r_in_unit << subsampling_y;
        const int row_uv = row | subsampling_y;
        int index = 0;
        const int shift = get_index_shift(col, row, &index);
        const uint64_t mask = ((uint64_t)1 << shift);
        skip = lfm->skip.bits[index] & mask;
        is_coding_block_border = lfm->is_horz_border.bits[index] & mask;
        switch (plane) {
          case 0: level = lfm->lfl_y_hor[row_uv][col_uv]; break;
          case 1: level = lfm->lfl_u_hor[row_uv][col_uv]; break;
          case 2: level = lfm->lfl_v_hor[row_uv][col_uv]; break;
          default: assert(plane >= 0 && plane <= 2); return;
        }
        // Take the first transform size, in TX_4X4..TX_64X64 order, whose mask
        // has this position set (TX_64X64 is not checked for chroma). If none
        // is set, tx_size keeps the value from the previous block.
        for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
          if (is_uv && ts == TX_64X64) continue;
          if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) {
            tx_size = ts;
            break;
          }
        }
        // An edge is marked when this is not the first row of the plane, at
        // least one side has a non-zero level, and either side is not skipped
        // or the edge is a coding block border.
        if ((r + r_in_unit > 0) && (level || prev_level) &&
            (!prev_skip || !skip || is_coding_block_border)) {
          // The edge is recorded under the smaller of the two transform sizes,
          // clamped to TX_16X16.
          const TX_SIZE min_tx_size =
              AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
          // This call rewrites `index` for the plane's own (row_uv, col_uv)
          // position; the mask update below uses that value.
          const int shift_1 = get_index_shift(col_uv, row_uv, &index);
          const uint64_t mask_1 = ((uint64_t)1 << shift_1);

          switch (plane) {
            case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break;
            case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break;
            case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break;
            default: assert(plane >= 0 && plane <= 2); return;
          }
          // The edge is marked although this block's own level is 0: store the
          // previous block's level at this position (presumably so the
          // filtering pass, which reads the level here, finds a non-zero one).
          if (level == 0 && prev_level != 0) {
            switch (plane) {
              case 0: lfm->lfl_y_hor[row_uv][col_uv] = prev_level; break;
              case 1: lfm->lfl_u_hor[row_uv][col_uv] = prev_level; break;
              case 2: lfm->lfl_v_hor[row_uv][col_uv] = prev_level; break;
              default: assert(plane >= 0 && plane <= 2); return;
            }
          }
        }

        // update prev info
        prev_level = level;
        prev_skip = skip;
        prev_tx_size = tx_size;
        // advance by the height of the current transform block
        r_in_unit += tx_size_high_unit[tx_size];
      }
    }
  }
}
1637 
// Filters the vertical (left) edges of plane `pl` inside the unit described
// by get_loop_filter_mask(cm, mi_row, mi_col), using the left_{y,u,v} masks
// and the lfl_{y,u,v}_ver level tables of that LoopFilterMask. Rows are
// handled in pairs: each pass extracts 16 mask bits for the current row and 16
// for the next one and hands both to the *_vert_row2 helper. dst->buf is
// advanced while iterating and set back to its entry value before the normal
// return. An out-of-range `pl` asserts and returns.
void av1_filter_block_plane_bitmask_vert(
    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
    int mi_row, int mi_col) {
  struct buf_2d *const dst = &plane_ptr->dst;
  uint8_t *const entry_buf = dst->buf;
  const int ssx = plane_ptr->subsampling_x;
  const int ssy = plane_ptr->subsampling_y;
  const uint64_t row_bits = 0xffff;  // 16 mask bits per row
  const int next_row_offset = 1 << ssy;
  const int rows_per_pass = 2 << ssy;
  const int pair_stride = (dst->stride << MI_SIZE_LOG2) << 1;
  LoopFilterMask *const lfm = get_loop_filter_mask(cm, mi_row, mi_col);
  assert(lfm);

  // Vertical filtering, two rows per pass.
  for (int r = 0;
       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
       r += rows_per_pass) {
    const int row = r | ssy;
    const int row_next = row + next_row_offset;
    const int col = ssx;
    int index = 0;
    const int shift = get_index_shift(col, row, &index);
    int index_next = 0;
    const int shift_next = get_index_shift(col, row_next, &index_next);
    // NOTE(review): row_next is relative to this unit while cm->mi_rows is a
    // frame-level count - confirm that mi_row is not needed in this test.
    const int has_next_row = row_next < cm->mi_rows;

    uint64_t bits_16x16 = 0;
    uint64_t bits_8x8 = 0;
    uint64_t bits_4x4 = 0;
    uint8_t *lfl;
    uint8_t *lfl2;
    switch (pl) {
      case 0:
        bits_16x16 = lfm->left_y[TX_16X16].bits[index];
        bits_8x8 = lfm->left_y[TX_8X8].bits[index];
        bits_4x4 = lfm->left_y[TX_4X4].bits[index];
        lfl = &lfm->lfl_y_ver[row][col];
        lfl2 = &lfm->lfl_y_ver[row_next][col];
        break;
      case 1:
        bits_16x16 = lfm->left_u[TX_16X16].bits[index];
        bits_8x8 = lfm->left_u[TX_8X8].bits[index];
        bits_4x4 = lfm->left_u[TX_4X4].bits[index];
        lfl = &lfm->lfl_u_ver[row][col];
        lfl2 = &lfm->lfl_u_ver[row_next][col];
        break;
      case 2:
        bits_16x16 = lfm->left_v[TX_16X16].bits[index];
        bits_8x8 = lfm->left_v[TX_8X8].bits[index];
        bits_4x4 = lfm->left_v[TX_4X4].bits[index];
        lfl = &lfm->lfl_v_ver[row][col];
        lfl2 = &lfm->lfl_v_ver[row_next][col];
        break;
      default: assert(pl >= 0 && pl <= 2); return;
    }

    // Current row's bits; the next row's bits are dropped when there is no
    // next row.
    const uint64_t mask_16x16_0 = (bits_16x16 >> shift) & row_bits;
    const uint64_t mask_8x8_0 = (bits_8x8 >> shift) & row_bits;
    const uint64_t mask_4x4_0 = (bits_4x4 >> shift) & row_bits;
    uint64_t mask_16x16_1 = (bits_16x16 >> shift_next) & row_bits;
    uint64_t mask_8x8_1 = (bits_8x8 >> shift_next) & row_bits;
    uint64_t mask_4x4_1 = (bits_4x4 >> shift_next) & row_bits;
    if (!has_next_row) {
      mask_16x16_1 = mask_8x8_1 = mask_4x4_1 = 0;
    }

    if (cm->seq_params.use_highbitdepth) {
      highbd_filter_selectively_vert_row2(
          ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
          mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
          &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
    } else {
      filter_selectively_vert_row2(
          ssx, dst->buf, dst->stride, pl, mask_16x16_0, mask_8x8_0, mask_4x4_0,
          mask_16x16_1, mask_8x8_1, mask_4x4_1, &cm->lf_info, lfl, lfl2);
    }
    dst->buf += pair_stride;
  }
  // Put the buffer pointer back for the horizontal filtering pass.
  dst->buf = entry_buf;
}
1720 
// Filters the horizontal (above) edges of plane `pl` inside the unit described
// by get_loop_filter_mask(cm, mi_row, mi_col), using the above_{y,u,v} masks
// and the lfl_{y,u,v}_hor level tables of that LoopFilterMask. One row of 16
// mask bits is extracted and filtered per pass; the very first row of the
// frame (mi_row + r == 0) has no edge above it and is skipped.
//
// plane_ptr->dst.buf is only read: the rows are walked with a private cursor,
// so the caller's buffer pointer is left untouched on every path, including
// the early return taken for an out-of-range `pl` (which asserts first).
void av1_filter_block_plane_bitmask_horz(
    AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
    int mi_row, int mi_col) {
  const struct buf_2d *const dst = &plane_ptr->dst;
  const int ssx = plane_ptr->subsampling_x;
  const int ssy = plane_ptr->subsampling_y;
  const int mask_cutoff = 0xffff;  // 16 mask bits per row
  const int row_step = 1 << ssy;
  const int row_stride = dst->stride << MI_SIZE_LOG2;
  uint64_t mask_16x16 = 0;
  uint64_t mask_8x8 = 0;
  uint64_t mask_4x4 = 0;
  uint8_t *lfl;
  // Row cursor; advanced once per iteration by the loop's increment so that
  // skipped rows move it as well.
  uint8_t *buf = dst->buf;
  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
  assert(lfm);
  for (int r = 0;
       ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
       r += row_step, buf += row_stride) {
    // Nothing lies above the first row of the frame.
    if (mi_row + r == 0) continue;

    const int row = r | ssy;
    const int col = ssx;
    int index = 0;
    const int shift = get_index_shift(col, row, &index);
    switch (pl) {
      case 0:
        mask_16x16 = lfm->above_y[TX_16X16].bits[index];
        mask_8x8 = lfm->above_y[TX_8X8].bits[index];
        mask_4x4 = lfm->above_y[TX_4X4].bits[index];
        lfl = &lfm->lfl_y_hor[row][col];
        break;
      case 1:
        mask_16x16 = lfm->above_u[TX_16X16].bits[index];
        mask_8x8 = lfm->above_u[TX_8X8].bits[index];
        mask_4x4 = lfm->above_u[TX_4X4].bits[index];
        lfl = &lfm->lfl_u_hor[row][col];
        break;
      case 2:
        mask_16x16 = lfm->above_v[TX_16X16].bits[index];
        mask_8x8 = lfm->above_v[TX_8X8].bits[index];
        mask_4x4 = lfm->above_v[TX_4X4].bits[index];
        lfl = &lfm->lfl_v_hor[row][col];
        break;
      default: assert(pl >= 0 && pl <= 2); return;
    }
    // Keep only this row's 16 bits.
    mask_16x16 = (mask_16x16 >> shift) & mask_cutoff;
    mask_8x8 = (mask_8x8 >> shift) & mask_cutoff;
    mask_4x4 = (mask_4x4 >> shift) & mask_cutoff;

    if (cm->seq_params.use_highbitdepth)
      highbd_filter_selectively_horiz(
          CONVERT_TO_SHORTPTR(buf), dst->stride, pl, ssx, mask_16x16, mask_8x8,
          mask_4x4, &cm->lf_info, lfl, (int)cm->seq_params.bit_depth);
    else
      filter_selectively_horiz(buf, dst->stride, pl, ssx, mask_16x16, mask_8x8,
                               mask_4x4, &cm->lf_info, lfl);
  }
}
1785 
// Filters the vertical (left) edges of plane `pl` over the region that starts
// at (mi_row, mi_col) and spans up to cm->seq_params.mib_size units in each
// direction, clipped to cm->height / cm->width. Rows are handled in pairs and
// columns in MI_SIZE_64X64 steps, each step using the LoopFilterMask returned
// by get_loop_filter_mask() for that position.
//
// dst->buf is advanced by two unit rows per pass and is NOT restored on
// return. An out-of-range `pl` asserts and returns.
void av1_filter_block_plane_ver(AV1_COMMON *const cm,
                                struct macroblockd_plane *const plane_ptr,
                                int pl, int mi_row, int mi_col) {
  struct buf_2d *const dst = &plane_ptr->dst;
  const int ssx = plane_ptr->subsampling_x;
  const int ssy = plane_ptr->subsampling_y;
  const uint64_t row_bits = 0xffff;  // 16 mask bits per row
  const int next_row_offset = 1 << ssy;
  const int rows_per_pass = 2 << ssy;

  // Two rows per pass.
  for (int r = 0; r < cm->seq_params.mib_size &&
                  (((mi_row + r) << MI_SIZE_LOG2) < cm->height);
       r += rows_per_pass) {
    for (int c = 0; c < cm->seq_params.mib_size &&
                    (((mi_col + c) << MI_SIZE_LOG2) < cm->width);
         c += MI_SIZE_64X64) {
      // Temporarily move to this column; undone at the end of the iteration.
      const int col_offset = (c << MI_SIZE_LOG2) >> ssx;
      dst->buf += col_offset;

      LoopFilterMask *const lfm =
          get_loop_filter_mask(cm, mi_row + r, mi_col + c);
      assert(lfm);
      const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
      const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
      int index = 0;
      const int shift = get_index_shift(col, row, &index);
      // The current and the next row are expected to share the same index;
      // only the shift of the next row differs.
      const int row_next = row + next_row_offset;
      int index_next = 0;
      const int shift_next = get_index_shift(col, row_next, &index_next);

      uint64_t bits_16x16 = 0;
      uint64_t bits_8x8 = 0;
      uint64_t bits_4x4 = 0;
      uint8_t *lfl;
      uint8_t *lfl2;
      switch (pl) {
        case 0:
          bits_16x16 = lfm->left_y[TX_16X16].bits[index];
          bits_8x8 = lfm->left_y[TX_8X8].bits[index];
          bits_4x4 = lfm->left_y[TX_4X4].bits[index];
          lfl = &lfm->lfl_y_ver[row][col];
          lfl2 = &lfm->lfl_y_ver[row_next][col];
          break;
        case 1:
          bits_16x16 = lfm->left_u[TX_16X16].bits[index];
          bits_8x8 = lfm->left_u[TX_8X8].bits[index];
          bits_4x4 = lfm->left_u[TX_4X4].bits[index];
          lfl = &lfm->lfl_u_ver[row][col];
          lfl2 = &lfm->lfl_u_ver[row_next][col];
          break;
        case 2:
          bits_16x16 = lfm->left_v[TX_16X16].bits[index];
          bits_8x8 = lfm->left_v[TX_8X8].bits[index];
          bits_4x4 = lfm->left_v[TX_4X4].bits[index];
          lfl = &lfm->lfl_v_ver[row][col];
          lfl2 = &lfm->lfl_v_ver[row_next][col];
          break;
        default: assert(pl >= 0 && pl <= 2); return;
      }

      const uint64_t mask_16x16_0 = (bits_16x16 >> shift) & row_bits;
      const uint64_t mask_8x8_0 = (bits_8x8 >> shift) & row_bits;
      const uint64_t mask_4x4_0 = (bits_4x4 >> shift) & row_bits;
      const uint64_t mask_16x16_1 = (bits_16x16 >> shift_next) & row_bits;
      const uint64_t mask_8x8_1 = (bits_8x8 >> shift_next) & row_bits;
      const uint64_t mask_4x4_1 = (bits_4x4 >> shift_next) & row_bits;

      if (cm->seq_params.use_highbitdepth) {
        highbd_filter_selectively_vert_row2(
            ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, mask_16x16_0,
            mask_8x8_0, mask_4x4_0, mask_16x16_1, mask_8x8_1, mask_4x4_1,
            &cm->lf_info, lfl, lfl2, (int)cm->seq_params.bit_depth);
      } else {
        filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl,
                                     mask_16x16_0, mask_8x8_0, mask_4x4_0,
                                     mask_16x16_1, mask_8x8_1, mask_4x4_1,
                                     &cm->lf_info, lfl, lfl2);
      }
      dst->buf -= col_offset;
    }
    // Move down by the two rows just filtered.
    dst->buf += 2 * MI_SIZE * dst->stride;
  }
}
1867 
// Filters the horizontal (above) edges of plane `pl` over the region that
// starts at (mi_row, mi_col) and spans up to cm->seq_params.mib_size units in
// each direction, clipped to cm->height / cm->width. One row is handled per
// pass and columns advance in MI_SIZE_64X64 steps, each step using the
// LoopFilterMask returned by get_loop_filter_mask() for that position. The
// first row of the frame (mi_row + r == 0) is not filtered.
//
// dst->buf is advanced by one unit row per pass and is NOT restored on
// return. An out-of-range `pl` asserts and returns.
void av1_filter_block_plane_hor(AV1_COMMON *const cm,
                                struct macroblockd_plane *const plane_ptr,
                                int pl, int mi_row, int mi_col) {
  struct buf_2d *const dst = &plane_ptr->dst;
  const int ssx = plane_ptr->subsampling_x;
  const int ssy = plane_ptr->subsampling_y;
  const uint64_t row_bits = 0xffff;  // 16 mask bits per row
  const int rows_per_pass = 1 << ssy;

  for (int r = 0; r < cm->seq_params.mib_size &&
                  (((mi_row + r) << MI_SIZE_LOG2) < cm->height);
       r += rows_per_pass) {
    // Nothing lies above the first row of the frame, so its columns are
    // skipped; the row advance below still happens.
    if (mi_row + r != 0) {
      for (int c = 0; c < cm->seq_params.mib_size &&
                      (((mi_col + c) << MI_SIZE_LOG2) < cm->width);
           c += MI_SIZE_64X64) {
        // Temporarily move to this column; undone at the end of the
        // iteration.
        const int col_offset = (c << MI_SIZE_LOG2) >> ssx;
        dst->buf += col_offset;

        LoopFilterMask *const lfm =
            get_loop_filter_mask(cm, mi_row + r, mi_col + c);
        assert(lfm);
        const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
        const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
        int index = 0;
        const int shift = get_index_shift(col, row, &index);

        uint64_t mask_16x16 = 0;
        uint64_t mask_8x8 = 0;
        uint64_t mask_4x4 = 0;
        uint8_t *lfl;
        switch (pl) {
          case 0:
            mask_16x16 = lfm->above_y[TX_16X16].bits[index];
            mask_8x8 = lfm->above_y[TX_8X8].bits[index];
            mask_4x4 = lfm->above_y[TX_4X4].bits[index];
            lfl = &lfm->lfl_y_hor[row][col];
            break;
          case 1:
            mask_16x16 = lfm->above_u[TX_16X16].bits[index];
            mask_8x8 = lfm->above_u[TX_8X8].bits[index];
            mask_4x4 = lfm->above_u[TX_4X4].bits[index];
            lfl = &lfm->lfl_u_hor[row][col];
            break;
          case 2:
            mask_16x16 = lfm->above_v[TX_16X16].bits[index];
            mask_8x8 = lfm->above_v[TX_8X8].bits[index];
            mask_4x4 = lfm->above_v[TX_4X4].bits[index];
            lfl = &lfm->lfl_v_hor[row][col];
            break;
          default: assert(pl >= 0 && pl <= 2); return;
        }
        // Keep only this row's 16 bits.
        mask_16x16 = (mask_16x16 >> shift) & row_bits;
        mask_8x8 = (mask_8x8 >> shift) & row_bits;
        mask_4x4 = (mask_4x4 >> shift) & row_bits;

        if (cm->seq_params.use_highbitdepth) {
          highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
                                          dst->stride, pl, ssx, mask_16x16,
                                          mask_8x8, mask_4x4, &cm->lf_info, lfl,
                                          (int)cm->seq_params.bit_depth);
        } else {
          filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, mask_16x16,
                                   mask_8x8, mask_4x4, &cm->lf_info, lfl);
        }
        dst->buf -= col_offset;
      }
    }
    // Move down by the row just handled.
    dst->buf += MI_SIZE * dst->stride;
  }
}
1935 #endif  // LOOP_FILTER_BITMASK
1936 
get_transform_size(const MACROBLOCKD * const xd,const MB_MODE_INFO * const mbmi,const EDGE_DIR edge_dir,const int mi_row,const int mi_col,const int plane,const struct macroblockd_plane * plane_ptr)1937 static TX_SIZE get_transform_size(const MACROBLOCKD *const xd,
1938                                   const MB_MODE_INFO *const mbmi,
1939                                   const EDGE_DIR edge_dir, const int mi_row,
1940                                   const int mi_col, const int plane,
1941                                   const struct macroblockd_plane *plane_ptr) {
1942   assert(mbmi != NULL);
1943   if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4;
1944 
1945   TX_SIZE tx_size =
1946       (plane == AOM_PLANE_Y)
1947           ? mbmi->tx_size
1948           : av1_get_max_uv_txsize(mbmi->sb_type, plane_ptr->subsampling_x,
1949                                   plane_ptr->subsampling_y);
1950   assert(tx_size < TX_SIZES_ALL);
1951   if ((plane == AOM_PLANE_Y) && is_inter_block(mbmi) && !mbmi->skip) {
1952     const BLOCK_SIZE sb_type = mbmi->sb_type;
1953     const int blk_row = mi_row & (mi_size_high[sb_type] - 1);
1954     const int blk_col = mi_col & (mi_size_wide[sb_type] - 1);
1955     const TX_SIZE mb_tx_size =
1956         mbmi->inter_tx_size[av1_get_txb_size_index(sb_type, blk_row, blk_col)];
1957     assert(mb_tx_size < TX_SIZES_ALL);
1958     tx_size = mb_tx_size;
1959   }
1960 
1961   // since in case of chrominance or non-square transorm need to convert
1962   // transform size into transform size in particular direction.
1963   // for vertical edge, filter direction is horizontal, for horizontal
1964   // edge, filter direction is vertical.
1965   tx_size = (VERT_EDGE == edge_dir) ? txsize_horz_map[tx_size]
1966                                     : txsize_vert_map[tx_size];
1967   return tx_size;
1968 }
1969 
// Per-edge deblocking parameters. Filled in by set_lpf_parameters() and read
// by av1_filter_block_plane_vert() / av1_filter_block_plane_horz() to pick and
// parameterize the edge filter.
typedef struct AV1_DEBLOCKING_PARAMETERS {
  // length of the filter applied to the outer edge; set_lpf_parameters()
  // writes 0 (no filtering), 4, 6, 8 or 14
  uint32_t filter_length;
  // deblocking limits; when filter_length is non-zero, set_lpf_parameters()
  // points these at the lim / mblim / hev_thr members of the
  // cm->lf_info.lfthr entry indexed by the chosen filter level
  const uint8_t *lim;
  const uint8_t *mblim;
  const uint8_t *hev_thr;
} AV1_DEBLOCKING_PARAMETERS;
1978 
1979 // Return TX_SIZE from get_transform_size(), so it is plane and direction
1980 // awared
set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS * const params,const ptrdiff_t mode_step,const AV1_COMMON * const cm,const MACROBLOCKD * const xd,const EDGE_DIR edge_dir,const uint32_t x,const uint32_t y,const int plane,const struct macroblockd_plane * const plane_ptr)1981 static TX_SIZE set_lpf_parameters(
1982     AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step,
1983     const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
1984     const EDGE_DIR edge_dir, const uint32_t x, const uint32_t y,
1985     const int plane, const struct macroblockd_plane *const plane_ptr) {
1986   // reset to initial values
1987   params->filter_length = 0;
1988 
1989   // no deblocking is required
1990   const uint32_t width = plane_ptr->dst.width;
1991   const uint32_t height = plane_ptr->dst.height;
1992   if ((width <= x) || (height <= y)) {
1993     // just return the smallest transform unit size
1994     return TX_4X4;
1995   }
1996 
1997   const uint32_t scale_horz = plane_ptr->subsampling_x;
1998   const uint32_t scale_vert = plane_ptr->subsampling_y;
1999   // for sub8x8 block, chroma prediction mode is obtained from the bottom/right
2000   // mi structure of the co-located 8x8 luma block. so for chroma plane, mi_row
2001   // and mi_col should map to the bottom/right mi structure, i.e, both mi_row
2002   // and mi_col should be odd number for chroma plane.
2003   const int mi_row = scale_vert | ((y << scale_vert) >> MI_SIZE_LOG2);
2004   const int mi_col = scale_horz | ((x << scale_horz) >> MI_SIZE_LOG2);
2005   MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
2006   const MB_MODE_INFO *mbmi = mi[0];
2007   // If current mbmi is not correctly setup, return an invalid value to stop
2008   // filtering. One example is that if this tile is not coded, then its mbmi
2009   // it not set up.
2010   if (mbmi == NULL) return TX_INVALID;
2011 
2012   const TX_SIZE ts =
2013       get_transform_size(xd, mi[0], edge_dir, mi_row, mi_col, plane, plane_ptr);
2014 
2015   {
2016     const uint32_t coord = (VERT_EDGE == edge_dir) ? (x) : (y);
2017     const uint32_t transform_masks =
2018         edge_dir == VERT_EDGE ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1;
2019     const int32_t tu_edge = (coord & transform_masks) ? (0) : (1);
2020 
2021     if (!tu_edge) return ts;
2022 
2023     // prepare outer edge parameters. deblock the edge if it's an edge of a TU
2024     {
2025       const uint32_t curr_level =
2026           get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
2027       const int curr_skipped = mbmi->skip && is_inter_block(mbmi);
2028       uint32_t level = curr_level;
2029       if (coord) {
2030         {
2031           const MB_MODE_INFO *const mi_prev = *(mi - mode_step);
2032           if (mi_prev == NULL) return TX_INVALID;
2033           const int pv_row =
2034               (VERT_EDGE == edge_dir) ? (mi_row) : (mi_row - (1 << scale_vert));
2035           const int pv_col =
2036               (VERT_EDGE == edge_dir) ? (mi_col - (1 << scale_horz)) : (mi_col);
2037           const TX_SIZE pv_ts = get_transform_size(
2038               xd, mi_prev, edge_dir, pv_row, pv_col, plane, plane_ptr);
2039 
2040           const uint32_t pv_lvl =
2041               get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
2042 
2043           const int pv_skip = mi_prev->skip && is_inter_block(mi_prev);
2044           const BLOCK_SIZE bsize =
2045               get_plane_block_size(mbmi->sb_type, plane_ptr->subsampling_x,
2046                                    plane_ptr->subsampling_y);
2047           const int prediction_masks = edge_dir == VERT_EDGE
2048                                            ? block_size_wide[bsize] - 1
2049                                            : block_size_high[bsize] - 1;
2050           const int32_t pu_edge = !(coord & prediction_masks);
2051           // if the current and the previous blocks are skipped,
2052           // deblock the edge if the edge belongs to a PU's edge only.
2053           if ((curr_level || pv_lvl) &&
2054               (!pv_skip || !curr_skipped || pu_edge)) {
2055             const TX_SIZE min_ts = AOMMIN(ts, pv_ts);
2056             if (TX_4X4 >= min_ts) {
2057               params->filter_length = 4;
2058             } else if (TX_8X8 == min_ts) {
2059               if (plane != 0)
2060                 params->filter_length = 6;
2061               else
2062                 params->filter_length = 8;
2063             } else {
2064               params->filter_length = 14;
2065               // No wide filtering for chroma plane
2066               if (plane != 0) {
2067                 params->filter_length = 6;
2068               }
2069             }
2070 
2071             // update the level if the current block is skipped,
2072             // but the previous one is not
2073             level = (curr_level) ? (curr_level) : (pv_lvl);
2074           }
2075         }
2076       }
2077       // prepare common parameters
2078       if (params->filter_length) {
2079         const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
2080         params->lim = limits->lim;
2081         params->mblim = limits->mblim;
2082         params->hev_thr = limits->hev_thr;
2083       }
2084     }
2085   }
2086 
2087   return ts;
2088 }
2089 
av1_filter_block_plane_vert(const AV1_COMMON * const cm,const MACROBLOCKD * const xd,const int plane,const MACROBLOCKD_PLANE * const plane_ptr,const uint32_t mi_row,const uint32_t mi_col)2090 void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
2091                                  const MACROBLOCKD *const xd, const int plane,
2092                                  const MACROBLOCKD_PLANE *const plane_ptr,
2093                                  const uint32_t mi_row, const uint32_t mi_col) {
2094   const int row_step = MI_SIZE >> MI_SIZE_LOG2;
2095   const uint32_t scale_horz = plane_ptr->subsampling_x;
2096   const uint32_t scale_vert = plane_ptr->subsampling_y;
2097   uint8_t *const dst_ptr = plane_ptr->dst.buf;
2098   const int dst_stride = plane_ptr->dst.stride;
2099   const int y_range = (MAX_MIB_SIZE >> scale_vert);
2100   const int x_range = (MAX_MIB_SIZE >> scale_horz);
2101   const int use_highbitdepth = cm->seq_params.use_highbitdepth;
2102   const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
2103   for (int y = 0; y < y_range; y += row_step) {
2104     uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
2105     for (int x = 0; x < x_range;) {
2106       // inner loop always filter vertical edges in a MI block. If MI size
2107       // is 8x8, it will filter the vertical edge aligned with a 8x8 block.
2108       // If 4x4 trasnform is used, it will then filter the internal edge
2109       //  aligned with a 4x4 block
2110       const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
2111       const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
2112       uint32_t advance_units;
2113       TX_SIZE tx_size;
2114       AV1_DEBLOCKING_PARAMETERS params;
2115       memset(&params, 0, sizeof(params));
2116 
2117       tx_size =
2118           set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd,
2119                              VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
2120       if (tx_size == TX_INVALID) {
2121         params.filter_length = 0;
2122         tx_size = TX_4X4;
2123       }
2124 
2125       switch (params.filter_length) {
2126         // apply 4-tap filtering
2127         case 4:
2128           if (use_highbitdepth)
2129             aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), dst_stride,
2130                                       params.mblim, params.lim, params.hev_thr,
2131                                       bit_depth);
2132           else
2133             aom_lpf_vertical_4(p, dst_stride, params.mblim, params.lim,
2134                                params.hev_thr);
2135           break;
2136         case 6:  // apply 6-tap filter for chroma plane only
2137           assert(plane != 0);
2138           if (use_highbitdepth)
2139             aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), dst_stride,
2140                                       params.mblim, params.lim, params.hev_thr,
2141                                       bit_depth);
2142           else
2143             aom_lpf_vertical_6(p, dst_stride, params.mblim, params.lim,
2144                                params.hev_thr);
2145           break;
2146         // apply 8-tap filtering
2147         case 8:
2148           if (use_highbitdepth)
2149             aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), dst_stride,
2150                                       params.mblim, params.lim, params.hev_thr,
2151                                       bit_depth);
2152           else
2153             aom_lpf_vertical_8(p, dst_stride, params.mblim, params.lim,
2154                                params.hev_thr);
2155           break;
2156         // apply 14-tap filtering
2157         case 14:
2158           if (use_highbitdepth)
2159             aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), dst_stride,
2160                                        params.mblim, params.lim, params.hev_thr,
2161                                        bit_depth);
2162           else
2163             aom_lpf_vertical_14(p, dst_stride, params.mblim, params.lim,
2164                                 params.hev_thr);
2165           break;
2166         // no filtering
2167         default: break;
2168       }
2169       // advance the destination pointer
2170       advance_units = tx_size_wide_unit[tx_size];
2171       x += advance_units;
2172       p += advance_units * MI_SIZE;
2173     }
2174   }
2175 }
2176 
av1_filter_block_plane_horz(const AV1_COMMON * const cm,const MACROBLOCKD * const xd,const int plane,const MACROBLOCKD_PLANE * const plane_ptr,const uint32_t mi_row,const uint32_t mi_col)2177 void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
2178                                  const MACROBLOCKD *const xd, const int plane,
2179                                  const MACROBLOCKD_PLANE *const plane_ptr,
2180                                  const uint32_t mi_row, const uint32_t mi_col) {
2181   const int col_step = MI_SIZE >> MI_SIZE_LOG2;
2182   const uint32_t scale_horz = plane_ptr->subsampling_x;
2183   const uint32_t scale_vert = plane_ptr->subsampling_y;
2184   uint8_t *const dst_ptr = plane_ptr->dst.buf;
2185   const int dst_stride = plane_ptr->dst.stride;
2186   const int y_range = (MAX_MIB_SIZE >> scale_vert);
2187   const int x_range = (MAX_MIB_SIZE >> scale_horz);
2188   const int use_highbitdepth = cm->seq_params.use_highbitdepth;
2189   const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
2190   for (int x = 0; x < x_range; x += col_step) {
2191     uint8_t *p = dst_ptr + x * MI_SIZE;
2192     for (int y = 0; y < y_range;) {
2193       // inner loop always filter vertical edges in a MI block. If MI size
2194       // is 8x8, it will first filter the vertical edge aligned with a 8x8
2195       // block. If 4x4 trasnform is used, it will then filter the internal
2196       // edge aligned with a 4x4 block
2197       const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
2198       const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
2199       uint32_t advance_units;
2200       TX_SIZE tx_size;
2201       AV1_DEBLOCKING_PARAMETERS params;
2202       memset(&params, 0, sizeof(params));
2203 
2204       tx_size =
2205           set_lpf_parameters(&params, (cm->mi_stride << scale_vert), cm, xd,
2206                              HORZ_EDGE, curr_x, curr_y, plane, plane_ptr);
2207       if (tx_size == TX_INVALID) {
2208         params.filter_length = 0;
2209         tx_size = TX_4X4;
2210       }
2211 
2212       switch (params.filter_length) {
2213         // apply 4-tap filtering
2214         case 4:
2215           if (use_highbitdepth)
2216             aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), dst_stride,
2217                                         params.mblim, params.lim,
2218                                         params.hev_thr, bit_depth);
2219           else
2220             aom_lpf_horizontal_4(p, dst_stride, params.mblim, params.lim,
2221                                  params.hev_thr);
2222           break;
2223         // apply 6-tap filtering
2224         case 6:
2225           assert(plane != 0);
2226           if (use_highbitdepth)
2227             aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), dst_stride,
2228                                         params.mblim, params.lim,
2229                                         params.hev_thr, bit_depth);
2230           else
2231             aom_lpf_horizontal_6(p, dst_stride, params.mblim, params.lim,
2232                                  params.hev_thr);
2233           break;
2234         // apply 8-tap filtering
2235         case 8:
2236           if (use_highbitdepth)
2237             aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), dst_stride,
2238                                         params.mblim, params.lim,
2239                                         params.hev_thr, bit_depth);
2240           else
2241             aom_lpf_horizontal_8(p, dst_stride, params.mblim, params.lim,
2242                                  params.hev_thr);
2243           break;
2244         // apply 14-tap filtering
2245         case 14:
2246           if (use_highbitdepth)
2247             aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), dst_stride,
2248                                          params.mblim, params.lim,
2249                                          params.hev_thr, bit_depth);
2250           else
2251             aom_lpf_horizontal_14(p, dst_stride, params.mblim, params.lim,
2252                                   params.hev_thr);
2253           break;
2254         // no filtering
2255         default: break;
2256       }
2257 
2258       // advance the destination pointer
2259       advance_units = tx_size_high_unit[tx_size];
2260       y += advance_units;
2261       p += advance_units * dst_stride * MI_SIZE;
2262     }
2263   }
2264 }
2265 
av1_filter_block_plane_vert_test(const AV1_COMMON * const cm,const MACROBLOCKD * const xd,const int plane,const MACROBLOCKD_PLANE * const plane_ptr,const uint32_t mi_row,const uint32_t mi_col)2266 void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm,
2267                                       const MACROBLOCKD *const xd,
2268                                       const int plane,
2269                                       const MACROBLOCKD_PLANE *const plane_ptr,
2270                                       const uint32_t mi_row,
2271                                       const uint32_t mi_col) {
2272   const int row_step = MI_SIZE >> MI_SIZE_LOG2;
2273   const uint32_t scale_horz = plane_ptr->subsampling_x;
2274   const uint32_t scale_vert = plane_ptr->subsampling_y;
2275   uint8_t *const dst_ptr = plane_ptr->dst.buf;
2276   const int dst_stride = plane_ptr->dst.stride;
2277   const int y_range = cm->mi_rows >> scale_vert;
2278   const int x_range = cm->mi_cols >> scale_horz;
2279   for (int y = 0; y < y_range; y += row_step) {
2280     uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
2281     for (int x = 0; x < x_range;) {
2282       // inner loop always filter vertical edges in a MI block. If MI size
2283       // is 8x8, it will filter the vertical edge aligned with a 8x8 block.
2284       // If 4x4 trasnform is used, it will then filter the internal edge
2285       //  aligned with a 4x4 block
2286       const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
2287       const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
2288       uint32_t advance_units;
2289       TX_SIZE tx_size;
2290       AV1_DEBLOCKING_PARAMETERS params;
2291       memset(&params, 0, sizeof(params));
2292 
2293       tx_size =
2294           set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd,
2295                              VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
2296       if (tx_size == TX_INVALID) {
2297         params.filter_length = 0;
2298         tx_size = TX_4X4;
2299       }
2300 
2301       // advance the destination pointer
2302       advance_units = tx_size_wide_unit[tx_size];
2303       x += advance_units;
2304       p += advance_units * MI_SIZE;
2305     }
2306   }
2307 }
2308 
av1_filter_block_plane_horz_test(const AV1_COMMON * const cm,const MACROBLOCKD * const xd,const int plane,const MACROBLOCKD_PLANE * const plane_ptr,const uint32_t mi_row,const uint32_t mi_col)2309 void av1_filter_block_plane_horz_test(const AV1_COMMON *const cm,
2310                                       const MACROBLOCKD *const xd,
2311                                       const int plane,
2312                                       const MACROBLOCKD_PLANE *const plane_ptr,
2313                                       const uint32_t mi_row,
2314                                       const uint32_t mi_col) {
2315   const int col_step = MI_SIZE >> MI_SIZE_LOG2;
2316   const uint32_t scale_horz = plane_ptr->subsampling_x;
2317   const uint32_t scale_vert = plane_ptr->subsampling_y;
2318   uint8_t *const dst_ptr = plane_ptr->dst.buf;
2319   const int dst_stride = plane_ptr->dst.stride;
2320   const int y_range = cm->mi_rows >> scale_vert;
2321   const int x_range = cm->mi_cols >> scale_horz;
2322   for (int x = 0; x < x_range; x += col_step) {
2323     uint8_t *p = dst_ptr + x * MI_SIZE;
2324     for (int y = 0; y < y_range;) {
2325       // inner loop always filter vertical edges in a MI block. If MI size
2326       // is 8x8, it will first filter the vertical edge aligned with a 8x8
2327       // block. If 4x4 trasnform is used, it will then filter the internal
2328       // edge aligned with a 4x4 block
2329       const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
2330       const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
2331       uint32_t advance_units;
2332       TX_SIZE tx_size;
2333       AV1_DEBLOCKING_PARAMETERS params;
2334       memset(&params, 0, sizeof(params));
2335 
2336       tx_size =
2337           set_lpf_parameters(&params, (cm->mi_stride << scale_vert), cm, xd,
2338                              HORZ_EDGE, curr_x, curr_y, plane, plane_ptr);
2339       if (tx_size == TX_INVALID) {
2340         params.filter_length = 0;
2341         tx_size = TX_4X4;
2342       }
2343 
2344       // advance the destination pointer
2345       advance_units = tx_size_high_unit[tx_size];
2346       y += advance_units;
2347       p += advance_units * dst_stride * MI_SIZE;
2348     }
2349   }
2350 }
2351 
// Deblocks mode-info rows [start, stop) of `frame_buffer`, over every column
// of the frame, for planes [plane_start, plane_end).
//
// Plane selection: if plane 0 is in range and both luma filter levels are 0,
// processing stops there; a chroma plane whose own filter level is 0 is
// skipped.
//
// With LOOP_FILTER_BITMASK enabled and `is_decoding` set, the bitmask based
// filters are used on 64x64 units and the function returns afterwards.
// Otherwise the per-edge filters run on MAX_MIB_SIZE units, either
//  - combined (cm->lf.combine_vert_horz_lf): vertical edges of a unit are
//    filtered first, then the horizontal edges of the unit to its left, with
//    the last unit of each row finished after the column loop; or
//  - in two passes: all vertical edges, then all horizontal edges.
static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
                             MACROBLOCKD *xd, int start, int stop,
#if LOOP_FILTER_BITMASK
                             int is_decoding,
#endif
                             int plane_start, int plane_end) {
  struct macroblockd_plane *const pd = xd->plane;
  const int col_start = 0;
  const int col_end = cm->mi_cols;

#if LOOP_FILTER_BITMASK
  if (is_decoding) {
    cm->is_decoding = is_decoding;
    for (int plane = plane_start; plane < plane_end; plane++) {
      if (plane == 0) {
        if (!(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) break;
      } else if (plane == 1) {
        if (!(cm->lf.filter_level_u)) continue;
      } else if (plane == 2) {
        if (!(cm->lf.filter_level_v)) continue;
      }

      av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, 0, 0,
                           plane, plane + 1);

      av1_build_bitmask_vert_info(cm, &pd[plane], plane);
      av1_build_bitmask_horz_info(cm, &pd[plane], plane);

      // Single pass over the buffer: horizontal filtering trails vertical
      // filtering by one 64x64 unit.
      for (int mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) {
        int mi_col;
        for (mi_col = col_start; mi_col < col_end; mi_col += MI_SIZE_64X64) {
          av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, mi_col,
                               plane, plane + 1);
          av1_filter_block_plane_bitmask_vert(cm, &pd[plane], plane, mi_row,
                                              mi_col);
          if (mi_col >= MI_SIZE_64X64) {
            const int prev_col = mi_col - MI_SIZE_64X64;
            av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row,
                                 prev_col, plane, plane + 1);
            av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
                                                prev_col);
          }
        }
        // Horizontal edges of the last unit in this row.
        const int last_col = mi_col - MI_SIZE_64X64;
        av1_setup_dst_planes(pd, BLOCK_64X64, frame_buffer, mi_row, last_col,
                             plane, plane + 1);
        av1_filter_block_plane_bitmask_horz(cm, &pd[plane], plane, mi_row,
                                            last_col);
      }
    }
    return;
  }
#endif

  for (int plane = plane_start; plane < plane_end; plane++) {
    if (plane == 0) {
      if (!(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) break;
    } else if (plane == 1) {
      if (!(cm->lf.filter_level_u)) continue;
    } else if (plane == 2) {
      if (!(cm->lf.filter_level_v)) continue;
    }

    if (!cm->lf.combine_vert_horz_lf) {
      // First pass: vertical edges of every MAX_MIB_SIZE unit.
      for (int mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
        for (int mi_col = col_start; mi_col < col_end;
             mi_col += MAX_MIB_SIZE) {
          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
                               mi_col, plane, plane + 1);
          av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
                                      mi_col);
        }
      }

      // Second pass: horizontal edges of every MAX_MIB_SIZE unit.
      for (int mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
        for (int mi_col = col_start; mi_col < col_end;
             mi_col += MAX_MIB_SIZE) {
          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
                               mi_col, plane, plane + 1);
          av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
                                      mi_col);
        }
      }
      continue;
    }

    // Combined mode: vertical edges of the current unit, then horizontal
    // edges of the unit to its left.
    for (int mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
      int mi_col;
      for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
        av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
                             mi_col, plane, plane + 1);
        av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row, mi_col);

        if (mi_col >= MAX_MIB_SIZE) {
          const int prev_col = mi_col - MAX_MIB_SIZE;
          av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer,
                               mi_row, prev_col, plane, plane + 1);
          av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
                                      prev_col);
        }
      }
      // Horizontal edges of the last unit in this row.
      const int last_col = mi_col - MAX_MIB_SIZE;
      av1_setup_dst_planes(pd, cm->seq_params.sb_size, frame_buffer, mi_row,
                           last_col, plane, plane + 1);
      av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row, last_col);
    }
  }
}
2460 
// Applies the loop filter to `frame` for planes [plane_start, plane_end).
//
// By default every mode-info row of the frame is filtered. When
// `partial_frame` is non-zero and the frame has more than 8 mode-info rows,
// only a band is filtered: it starts at half the row count rounded down to a
// multiple of 8 and spans max(cm->mi_rows / 8, 8) rows.
//
// av1_loop_filter_frame_init() is called before any row is filtered.
void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                           MACROBLOCKD *xd,
#if LOOP_FILTER_BITMASK
                           int is_decoding,
#endif
                           int plane_start, int plane_end, int partial_frame) {
  int first_row = 0;
  int row_count = cm->mi_rows;

  if (partial_frame && cm->mi_rows > 8) {
    const int half_rows = cm->mi_rows >> 1;
    first_row = half_rows & 0xfffffff8;
    row_count = AOMMAX(cm->mi_rows / 8, 8);
  }
  const int end_row = first_row + row_count;

  av1_loop_filter_frame_init(cm, plane_start, plane_end);
  loop_filter_rows(frame, cm, xd, first_row, end_row,
#if LOOP_FILTER_BITMASK
                   is_decoding,
#endif
                   plane_start, plane_end);
}
2484