1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/tile.h"
16 
17 #include <algorithm>
18 #include <array>
19 #include <cassert>
20 #include <climits>
21 #include <cstdlib>
22 #include <cstring>
23 #include <memory>
24 #include <new>
25 #include <numeric>
26 #include <type_traits>
27 #include <utility>
28 
29 #include "src/frame_scratch_buffer.h"
30 #include "src/motion_vector.h"
31 #include "src/reconstruction.h"
32 #include "src/utils/bit_mask_set.h"
33 #include "src/utils/common.h"
34 #include "src/utils/constants.h"
35 #include "src/utils/logging.h"
36 #include "src/utils/segmentation.h"
37 #include "src/utils/stack.h"
38 
39 namespace libgav1 {
40 namespace {
41 
42 // Import all the constants in the anonymous namespace.
43 #include "src/scan_tables.inc"
44 
45 // Range above kNumQuantizerBaseLevels which the exponential golomb coding
46 // process is activated.
47 constexpr int kQuantizerCoefficientBaseRange = 12;
48 constexpr int kNumQuantizerBaseLevels = 2;
49 constexpr int kCoeffBaseRangeMaxIterations =
50     kQuantizerCoefficientBaseRange / (kCoeffBaseRangeSymbolCount - 1);
51 constexpr int kEntropyContextLeft = 0;
52 constexpr int kEntropyContextTop = 1;
53 
// 5 x 5 lookup table of "all zero" contexts.
// NOTE(review): judging by the name the two indices are the top and left
// values; which one comes first is not visible here. The table is symmetric,
// so the order does not change the value that is read.
constexpr uint8_t kAllZeroContextsByTopLeft[5][5] = {
    {1, 2, 2, 2, 3},  // [0][0..4]
    {2, 4, 4, 4, 5},  // [1][0..4]
    {2, 4, 4, 4, 5},  // [2][0..4]
    {2, 4, 4, 4, 5},  // [3][0..4]
    {3, 5, 5, 5, 6},  // [4][0..4]
};
59 
// Capacity of the stack used for the depth-first traversal of the parameter
// tree. DFS needs O(branching_factor * max_depth) space. Here
// branching_factor = 4 (a node has up to 4 children) and max_depth = 5 when
// the root is excluded (going from a 128x128 block down to a 4x4 block). The
// worst-case stack size is therefore 3 + 3 + 3 + 3 + 4 = 16, which is the
// number of 'o' nodes in the diagram:
//
//   |                    128x128  The highest level (corresponding to the
//   |                             root of the tree) has no node in the stack.
//   |-----------------+
//   |     |     |     |
//   |     o     o     o  64x64
//   |
//   |-----------------+
//   |     |     |     |
//   |     o     o     o  32x32    Higher levels have three nodes in the stack,
//   |                             because we pop one node off the stack before
//   |-----------------+           pushing its four children onto the stack.
//   |     |     |     |
//   |     o     o     o  16x16
//   |
//   |-----------------+
//   |     |     |     |
//   |     o     o     o  8x8
//   |
//   |-----------------+
//   |     |     |     |
//   o     o     o     o  4x4      Only the lowest level has four nodes in the
//                                 stack.
constexpr int kDfsStackSize = 16;
89 
90 // Mask indicating whether the transform sets contain a particular transform
91 // type. If |tx_type| is present in |tx_set|, then the |tx_type|th LSB is set.
92 constexpr BitMaskSet kTransformTypeInSetMask[kNumTransformSets] = {
93     BitMaskSet(0x1),    BitMaskSet(0xE0F), BitMaskSet(0x20F),
94     BitMaskSet(0xFFFF), BitMaskSet(0xFFF), BitMaskSet(0x201)};
95 
96 constexpr PredictionMode
97     kFilterIntraModeToIntraPredictor[kNumFilterIntraPredictors] = {
98         kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
99         kPredictionModeD157, kPredictionModeDc};
100 
101 // Mask used to determine the index for mode_deltas lookup.
102 constexpr BitMaskSet kPredictionModeDeltasMask(
103     kPredictionModeNearestMv, kPredictionModeNearMv, kPredictionModeNewMv,
104     kPredictionModeNearestNearestMv, kPredictionModeNearNearMv,
105     kPredictionModeNearestNewMv, kPredictionModeNewNearestMv,
106     kPredictionModeNearNewMv, kPredictionModeNewNearMv,
107     kPredictionModeNewNewMv);
108 
109 // This is computed as:
110 // min(transform_width_log2, 5) + min(transform_height_log2, 5) - 4.
111 constexpr uint8_t kEobMultiSizeLookup[kNumTransformSizes] = {
112     0, 1, 2, 1, 2, 3, 4, 2, 3, 4, 5, 5, 4, 5, 6, 6, 5, 6, 6};
113 
114 /* clang-format off */
115 constexpr uint8_t kCoeffBaseContextOffset[kNumTransformSizes][5][5] = {
116     {{0, 1, 6, 6, 0}, {1, 6, 6, 21, 0}, {6, 6, 21, 21, 0}, {6, 21, 21, 21, 0},
117      {0, 0, 0, 0, 0}},
118     {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
119      {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
120     {{0, 11, 11, 11, 0}, {11, 11, 11, 11, 0}, {6, 6, 21, 21, 0},
121      {6, 21, 21, 21, 0}, {21, 21, 21, 21, 0}},
122     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
123      {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
124     {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
125      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
126     {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
127      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
128     {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
129      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
130     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
131      {16, 16, 21, 21, 21}, {0, 0, 0, 0, 0}},
132     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
133      {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
134     {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
135      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
136     {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
137      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
138     {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
139      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
140     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
141      {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
142     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
143      {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
144     {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
145      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
146     {{0, 11, 11, 11, 11}, {11, 11, 11, 11, 11}, {6, 6, 21, 21, 21},
147      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}},
148     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
149      {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
150     {{0, 16, 6, 6, 21}, {16, 16, 6, 21, 21}, {16, 16, 21, 21, 21},
151      {16, 16, 21, 21, 21}, {16, 16, 21, 21, 21}},
152     {{0, 1, 6, 6, 21}, {1, 6, 6, 21, 21}, {6, 6, 21, 21, 21},
153      {6, 21, 21, 21, 21}, {21, 21, 21, 21, 21}}};
154 /* clang-format on */
155 
// Only the first three entries are distinct. The last one is repeated until
// the table has 16 entries, so that row or column indices can be used
// directly without being clipped first.
constexpr uint8_t kCoeffBasePositionContextOffset[16] = {
    26, 31, 36,      // [0..2]
    36, 36, 36, 36,  // [3..6]
    36, 36, 36, 36,  // [7..10]
    36, 36, 36, 36,  // [11..14]
    36,              // [15]
};
160 
161 constexpr PredictionMode kInterIntraToIntraMode[kNumInterIntraModes] = {
162     kPredictionModeDc, kPredictionModeVertical, kPredictionModeHorizontal,
163     kPredictionModeSmooth};
164 
// Intra block copy can only be used after this many horizontal luma samples.
constexpr int kIntraBlockCopyDelayPixels = 256;
// The same delay expressed as a number of 64x64 blocks.
constexpr int kIntraBlockCopyDelay64x64Blocks = kIntraBlockCopyDelayPixels / 64;
169 
170 // Index [i][j] corresponds to the transform size of width 1 << (i + 2) and
171 // height 1 << (j + 2).
172 constexpr TransformSize k4x4SizeToTransformSize[5][5] = {
173     {kTransformSize4x4, kTransformSize4x8, kTransformSize4x16,
174      kNumTransformSizes, kNumTransformSizes},
175     {kTransformSize8x4, kTransformSize8x8, kTransformSize8x16,
176      kTransformSize8x32, kNumTransformSizes},
177     {kTransformSize16x4, kTransformSize16x8, kTransformSize16x16,
178      kTransformSize16x32, kTransformSize16x64},
179     {kNumTransformSizes, kTransformSize32x8, kTransformSize32x16,
180      kTransformSize32x32, kTransformSize32x64},
181     {kNumTransformSizes, kNumTransformSizes, kTransformSize64x16,
182      kTransformSize64x32, kTransformSize64x64}};
183 
184 // Defined in section 9.3 of the spec.
185 constexpr TransformType kModeToTransformType[kIntraPredictionModesUV] = {
186     kTransformTypeDctDct,   kTransformTypeDctAdst,  kTransformTypeAdstDct,
187     kTransformTypeDctDct,   kTransformTypeAdstAdst, kTransformTypeDctAdst,
188     kTransformTypeAdstDct,  kTransformTypeAdstDct,  kTransformTypeDctAdst,
189     kTransformTypeAdstAdst, kTransformTypeDctAdst,  kTransformTypeAdstDct,
190     kTransformTypeAdstAdst, kTransformTypeDctDct};
191 
192 // Defined in section 5.11.47 of the spec. This array does not contain an entry
193 // for kTransformSetDctOnly, so the first dimension needs to be
194 // |kNumTransformSets| - 1.
195 constexpr TransformType kInverseTransformTypeBySet[kNumTransformSets - 1][16] =
196     {{kTransformTypeIdentityIdentity, kTransformTypeDctDct,
197       kTransformTypeIdentityDct, kTransformTypeDctIdentity,
198       kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
199      {kTransformTypeIdentityIdentity, kTransformTypeDctDct,
200       kTransformTypeAdstAdst, kTransformTypeDctAdst, kTransformTypeAdstDct},
201      {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
202       kTransformTypeDctIdentity, kTransformTypeIdentityAdst,
203       kTransformTypeAdstIdentity, kTransformTypeIdentityFlipadst,
204       kTransformTypeFlipadstIdentity, kTransformTypeDctDct,
205       kTransformTypeDctAdst, kTransformTypeAdstDct, kTransformTypeDctFlipadst,
206       kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
207       kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
208       kTransformTypeAdstFlipadst},
209      {kTransformTypeIdentityIdentity, kTransformTypeIdentityDct,
210       kTransformTypeDctIdentity, kTransformTypeDctDct, kTransformTypeDctAdst,
211       kTransformTypeAdstDct, kTransformTypeDctFlipadst,
212       kTransformTypeFlipadstDct, kTransformTypeAdstAdst,
213       kTransformTypeFlipadstFlipadst, kTransformTypeFlipadstAdst,
214       kTransformTypeAdstFlipadst},
215      {kTransformTypeIdentityIdentity, kTransformTypeDctDct}};
216 
217 // Replaces all occurrences of 64x* and *x64 with 32x* and *x32 respectively.
218 constexpr TransformSize kAdjustedTransformSize[kNumTransformSizes] = {
219     kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
220     kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
221     kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
222     kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
223     kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
224     kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
225     kTransformSize32x32};
226 
227 // This is the same as Max_Tx_Size_Rect array in the spec but with *x64 and 64*x
228 // transforms replaced with *x32 and 32x* respectively.
229 constexpr TransformSize kUVTransformSize[kMaxBlockSizes] = {
230     kTransformSize4x4,   kTransformSize4x8,   kTransformSize4x16,
231     kTransformSize8x4,   kTransformSize8x8,   kTransformSize8x16,
232     kTransformSize8x32,  kTransformSize16x4,  kTransformSize16x8,
233     kTransformSize16x16, kTransformSize16x32, kTransformSize16x32,
234     kTransformSize32x8,  kTransformSize32x16, kTransformSize32x32,
235     kTransformSize32x32, kTransformSize32x16, kTransformSize32x32,
236     kTransformSize32x32, kTransformSize32x32, kTransformSize32x32,
237     kTransformSize32x32};
238 
239 // ith entry of this array is computed as:
240 // DivideBy2(TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[i]) +
241 //           TransformSizeToSquareTransformIndex(kTransformSizeSquareMax[i]) +
242 //           1)
243 constexpr uint8_t kTransformSizeContext[kNumTransformSizes] = {
244     0, 1, 1, 1, 1, 2, 2, 1, 2, 2, 3, 3, 2, 3, 3, 4, 3, 4, 4};
245 
// Pair of default multipliers.
// NOTE(review): presumably the initial values of the self-guided projection
// (SgrProj) loop restoration multipliers; the code that uses this table is
// not visible here.
constexpr int8_t kSgrProjDefaultMultiplier[2] = {
    -32,  // [0]
    31,   // [1]
};
247 
248 constexpr int8_t kWienerDefaultFilter[kNumWienerCoefficients] = {3, -7, 15};
249 
250 // Maps compound prediction modes into single modes. For e.g.
251 // kPredictionModeNearestNewMv will map to kPredictionModeNearestMv for index 0
252 // and kPredictionModeNewMv for index 1. It is used to simplify the logic in
253 // AssignMv (and avoid duplicate code). This is section 5.11.30. in the spec.
254 constexpr PredictionMode
255     kCompoundToSinglePredictionMode[kNumCompoundInterPredictionModes][2] = {
256         {kPredictionModeNearestMv, kPredictionModeNearestMv},
257         {kPredictionModeNearMv, kPredictionModeNearMv},
258         {kPredictionModeNearestMv, kPredictionModeNewMv},
259         {kPredictionModeNewMv, kPredictionModeNearestMv},
260         {kPredictionModeNearMv, kPredictionModeNewMv},
261         {kPredictionModeNewMv, kPredictionModeNearMv},
262         {kPredictionModeGlobalMv, kPredictionModeGlobalMv},
263         {kPredictionModeNewMv, kPredictionModeNewMv},
264 };
GetSinglePredictionMode(int index,PredictionMode y_mode)265 PredictionMode GetSinglePredictionMode(int index, PredictionMode y_mode) {
266   if (y_mode < kPredictionModeNearestNearestMv) {
267     return y_mode;
268   }
269   const int lookup_index = y_mode - kPredictionModeNearestNearestMv;
270   assert(lookup_index >= 0);
271   return kCompoundToSinglePredictionMode[lookup_index][index];
272 }
273 
274 // log2(dqDenom) in section 7.12.3 of the spec. We use the log2 value because
275 // dqDenom is always a power of two and hence right shift can be used instead of
276 // division.
277 constexpr uint8_t kQuantizationShift[kNumTransformSizes] = {
278     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 2, 1, 2, 2};
279 
// Returns the smaller of |length| and |max| - |start|. Use it to limit the
// number of elements that are accessed, starting at index |start|, in an
// array whose bound is |max|.
int GetNumElements(int length, int start, int max) {
  const int available = max - start;
  return (available < length) ? available : length;
}
285 
286 template <typename T>
SetBlockValues(int rows,int columns,T value,T * dst,ptrdiff_t stride)287 void SetBlockValues(int rows, int columns, T value, T* dst, ptrdiff_t stride) {
288   // Specialize all columns cases (values in kTransformWidth4x4[]) for better
289   // performance.
290   switch (columns) {
291     case 1:
292       MemSetBlock<T>(rows, 1, value, dst, stride);
293       break;
294     case 2:
295       MemSetBlock<T>(rows, 2, value, dst, stride);
296       break;
297     case 4:
298       MemSetBlock<T>(rows, 4, value, dst, stride);
299       break;
300     case 8:
301       MemSetBlock<T>(rows, 8, value, dst, stride);
302       break;
303     default:
304       assert(columns == 16);
305       MemSetBlock<T>(rows, 16, value, dst, stride);
306       break;
307   }
308 }
309 
SetTransformType(const Tile::Block & block,int x4,int y4,int w4,int h4,TransformType tx_type,TransformType transform_types[32][32])310 void SetTransformType(const Tile::Block& block, int x4, int y4, int w4, int h4,
311                       TransformType tx_type,
312                       TransformType transform_types[32][32]) {
313   const int y_offset = y4 - block.row4x4;
314   const int x_offset = x4 - block.column4x4;
315   TransformType* const dst = &transform_types[y_offset][x_offset];
316   SetBlockValues<TransformType>(h4, w4, tx_type, dst, 32);
317 }
318 
StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store,const MotionVector & mv_to_store,ptrdiff_t stride,int rows,int columns,ReferenceFrameType * reference_frame_row_start,MotionVector * mv)319 void StoreMotionFieldMvs(ReferenceFrameType reference_frame_to_store,
320                          const MotionVector& mv_to_store, ptrdiff_t stride,
321                          int rows, int columns,
322                          ReferenceFrameType* reference_frame_row_start,
323                          MotionVector* mv) {
324   static_assert(sizeof(*reference_frame_row_start) == sizeof(int8_t), "");
325   do {
326     // Don't switch the following two memory setting functions.
327     // Some ARM CPUs are quite sensitive to the order.
328     memset(reference_frame_row_start, reference_frame_to_store, columns);
329     std::fill(mv, mv + columns, mv_to_store);
330     reference_frame_row_start += stride;
331     mv += stride;
332   } while (--rows != 0);
333 }
334 
// The inverse transform process expects the quantized coefficients to form a
// virtual 2d array of size |tx_width| x |tx_height|. That does not hold when
// the transform width is 64: the scan order that populates the coefficients
// of such a transform is the one of the corresponding transform of width 32
// (e.g. 64x16 uses the scan order of 32x16), so the rows arrive packed 32
// values apart. This function moves every row to its position in the
// 64-wide layout and zeroes the positions that were vacated.
//
// Does nothing unless |tx_width| is 64. |clamped_tx_height| is the number of
// packed rows and is expected to be even and at least 4.
template <typename ResidualType>
void MoveCoefficientsForTxWidth64(int clamped_tx_height, int tx_width,
                                  ResidualType* residual) {
  if (tx_width != 64) return;
  constexpr int kPackedStride = 32;
  constexpr int kFullStride = 64;
  constexpr size_t kPackedRowBytes = kPackedStride * sizeof(ResidualType);
  // Start with the last pair of rows and walk towards the first pair, so that
  // no packed row is overwritten before it has been moved.
  const int first_row_of_last_pair = clamped_tx_height - 2;
  ResidualType* packed = residual + kPackedStride * first_row_of_last_pair;
  ResidualType* spread = residual + kFullStride * first_row_of_last_pair;
  int pairs_left = first_row_of_last_pair >> 1;
  do {
    // Within a pair the two rows can be moved in ascending order.
    memcpy(spread, packed, kPackedRowBytes);
    memcpy(spread + kFullStride, packed + kPackedStride, kPackedRowBytes);
    memset(packed + kPackedStride, 0, kPackedRowBytes);
    packed -= 2 * kPackedStride;
    spread -= 2 * kFullStride;
  } while (--pairs_left);
  // Only the second row is left to move; the first row is already in place.
  memcpy(spread + kFullStride, packed + kPackedStride, kPackedRowBytes);
  memset(packed + kPackedStride, 0, kPackedRowBytes);
}
363 
GetClampParameters(const Tile::Block & block,int min[2],int max[2])364 void GetClampParameters(const Tile::Block& block, int min[2], int max[2]) {
365   // 7.10.2.14 (part 1). (also contains implementations of 5.11.53
366   // and 5.11.54).
367   constexpr int kMvBorder4x4 = 4;
368   const int row_border = kMvBorder4x4 + block.height4x4;
369   const int column_border = kMvBorder4x4 + block.width4x4;
370   const int macroblocks_to_top_edge = -block.row4x4;
371   const int macroblocks_to_bottom_edge =
372       block.tile.frame_header().rows4x4 - block.height4x4 - block.row4x4;
373   const int macroblocks_to_left_edge = -block.column4x4;
374   const int macroblocks_to_right_edge =
375       block.tile.frame_header().columns4x4 - block.width4x4 - block.column4x4;
376   min[0] = MultiplyBy32(macroblocks_to_top_edge - row_border);
377   min[1] = MultiplyBy32(macroblocks_to_left_edge - column_border);
378   max[0] = MultiplyBy32(macroblocks_to_bottom_edge + row_border);
379   max[1] = MultiplyBy32(macroblocks_to_right_edge + column_border);
380 }
381 
382 // Section 8.3.2 in the spec, under coeff_base_eob.
GetCoeffBaseContextEob(TransformSize tx_size,int index)383 int GetCoeffBaseContextEob(TransformSize tx_size, int index) {
384   if (index == 0) return 0;
385   const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
386   const int tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
387   const int tx_height = kTransformHeight[adjusted_tx_size];
388   if (index <= DivideBy8(tx_height << tx_width_log2)) return 1;
389   if (index <= DivideBy4(tx_height << tx_width_log2)) return 2;
390   return 3;
391 }
392 
393 // Section 8.3.2 in the spec, under coeff_br. Optimized for end of block based
394 // on the fact that {0, 1}, {1, 0}, {1, 1}, {0, 2} and {2, 0} will all be 0 in
395 // the end of block case.
GetCoeffBaseRangeContextEob(int adjusted_tx_width_log2,int pos,TransformClass tx_class)396 int GetCoeffBaseRangeContextEob(int adjusted_tx_width_log2, int pos,
397                                 TransformClass tx_class) {
398   if (pos == 0) return 0;
399   const int tx_width = 1 << adjusted_tx_width_log2;
400   const int row = pos >> adjusted_tx_width_log2;
401   const int column = pos & (tx_width - 1);
402   // This return statement is equivalent to:
403   // return ((tx_class == kTransformClass2D && (row | column) < 2) ||
404   //         (tx_class == kTransformClassHorizontal && column == 0) ||
405   //         (tx_class == kTransformClassVertical && row == 0))
406   //            ? 7
407   //            : 14;
408   return 14 >> ((static_cast<int>(tx_class == kTransformClass2D) &
409                  static_cast<int>((row | column) < 2)) |
410                 (tx_class & static_cast<int>(column == 0)) |
411                 ((tx_class >> 1) & static_cast<int>(row == 0)));
412 }
413 
414 }  // namespace
415 
Tile(int tile_number,const uint8_t * const data,size_t size,const ObuSequenceHeader & sequence_header,const ObuFrameHeader & frame_header,RefCountedBuffer * const current_frame,const DecoderState & state,FrameScratchBuffer * const frame_scratch_buffer,const WedgeMaskArray & wedge_masks,const QuantizerMatrix & quantizer_matrix,SymbolDecoderContext * const saved_symbol_decoder_context,const SegmentationMap * prev_segment_ids,PostFilter * const post_filter,const dsp::Dsp * const dsp,ThreadPool * const thread_pool,BlockingCounterWithStatus * const pending_tiles,bool frame_parallel,bool use_intra_prediction_buffer)416 Tile::Tile(int tile_number, const uint8_t* const data, size_t size,
417            const ObuSequenceHeader& sequence_header,
418            const ObuFrameHeader& frame_header,
419            RefCountedBuffer* const current_frame, const DecoderState& state,
420            FrameScratchBuffer* const frame_scratch_buffer,
421            const WedgeMaskArray& wedge_masks,
422            const QuantizerMatrix& quantizer_matrix,
423            SymbolDecoderContext* const saved_symbol_decoder_context,
424            const SegmentationMap* prev_segment_ids,
425            PostFilter* const post_filter, const dsp::Dsp* const dsp,
426            ThreadPool* const thread_pool,
427            BlockingCounterWithStatus* const pending_tiles, bool frame_parallel,
428            bool use_intra_prediction_buffer)
429     : number_(tile_number),
430       row_(number_ / frame_header.tile_info.tile_columns),
431       column_(number_ % frame_header.tile_info.tile_columns),
432       data_(data),
433       size_(size),
434       read_deltas_(false),
435       subsampling_x_{0, sequence_header.color_config.subsampling_x,
436                      sequence_header.color_config.subsampling_x},
437       subsampling_y_{0, sequence_header.color_config.subsampling_y,
438                      sequence_header.color_config.subsampling_y},
439       current_quantizer_index_(frame_header.quantizer.base_index),
440       sequence_header_(sequence_header),
441       frame_header_(frame_header),
442       reference_frame_sign_bias_(state.reference_frame_sign_bias),
443       reference_frames_(state.reference_frame),
444       motion_field_(frame_scratch_buffer->motion_field),
445       reference_order_hint_(state.reference_order_hint),
446       wedge_masks_(wedge_masks),
447       quantizer_matrix_(quantizer_matrix),
448       reader_(data_, size_, frame_header_.enable_cdf_update),
449       symbol_decoder_context_(frame_scratch_buffer->symbol_decoder_context),
450       saved_symbol_decoder_context_(saved_symbol_decoder_context),
451       prev_segment_ids_(prev_segment_ids),
452       dsp_(*dsp),
453       post_filter_(*post_filter),
454       block_parameters_holder_(frame_scratch_buffer->block_parameters_holder),
455       quantizer_(sequence_header_.color_config.bitdepth,
456                  &frame_header_.quantizer),
457       residual_size_((sequence_header_.color_config.bitdepth == 8)
458                          ? sizeof(int16_t)
459                          : sizeof(int32_t)),
460       intra_block_copy_lag_(
461           frame_header_.allow_intrabc
462               ? (sequence_header_.use_128x128_superblock ? 3 : 5)
463               : 1),
464       current_frame_(*current_frame),
465       cdef_index_(frame_scratch_buffer->cdef_index),
466       inter_transform_sizes_(frame_scratch_buffer->inter_transform_sizes),
467       thread_pool_(thread_pool),
468       residual_buffer_pool_(frame_scratch_buffer->residual_buffer_pool.get()),
469       tile_scratch_buffer_pool_(
470           &frame_scratch_buffer->tile_scratch_buffer_pool),
471       pending_tiles_(pending_tiles),
472       frame_parallel_(frame_parallel),
473       use_intra_prediction_buffer_(use_intra_prediction_buffer),
474       intra_prediction_buffer_(
475           use_intra_prediction_buffer_
476               ? &frame_scratch_buffer->intra_prediction_buffers.get()[row_]
477               : nullptr) {
478   row4x4_start_ = frame_header.tile_info.tile_row_start[row_];
479   row4x4_end_ = frame_header.tile_info.tile_row_start[row_ + 1];
480   column4x4_start_ = frame_header.tile_info.tile_column_start[column_];
481   column4x4_end_ = frame_header.tile_info.tile_column_start[column_ + 1];
482   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
483   const int block_width4x4_log2 = k4x4HeightLog2[SuperBlockSize()];
484   superblock_rows_ =
485       (row4x4_end_ - row4x4_start_ + block_width4x4 - 1) >> block_width4x4_log2;
486   superblock_columns_ =
487       (column4x4_end_ - column4x4_start_ + block_width4x4 - 1) >>
488       block_width4x4_log2;
489   // If |split_parse_and_decode_| is true, we do the necessary setup for
490   // splitting the parsing and the decoding steps. This is done in the following
491   // two cases:
492   //  1) If there is multi-threading within a tile (this is done if
493   //     |thread_pool_| is not nullptr and if there are at least as many
494   //     superblock columns as |intra_block_copy_lag_|).
495   //  2) If |frame_parallel| is true.
496   split_parse_and_decode_ = (thread_pool_ != nullptr &&
497                              superblock_columns_ > intra_block_copy_lag_) ||
498                             frame_parallel;
499   if (frame_parallel_) {
500     reference_frame_progress_cache_.fill(INT_MIN);
501   }
502   memset(delta_lf_, 0, sizeof(delta_lf_));
503   delta_lf_all_zero_ = true;
504   const YuvBuffer& buffer = post_filter_.frame_buffer();
505   for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
506     // Verify that the borders are big enough for Reconstruct(). max_tx_length
507     // is the maximum value of tx_width and tx_height for the plane.
508     const int max_tx_length = (plane == kPlaneY) ? 64 : 32;
509     // Reconstruct() may overwrite on the right. Since the right border of a
510     // row is followed in memory by the left border of the next row, the
511     // number of extra pixels to the right of a row is at least the sum of the
512     // left and right borders.
513     //
514     // Note: This assertion actually checks the sum of the left and right
515     // borders of post_filter_.GetUnfilteredBuffer(), which is a horizontally
516     // and vertically shifted version of |buffer|. Since the sum of the left and
517     // right borders is not changed by the shift, we can just check the sum of
518     // the left and right borders of |buffer|.
519     assert(buffer.left_border(plane) + buffer.right_border(plane) >=
520            max_tx_length - 1);
521     // Reconstruct() may overwrite on the bottom. We need an extra border row
522     // on the bottom because we need the left border of that row.
523     //
524     // Note: This assertion checks the bottom border of
525     // post_filter_.GetUnfilteredBuffer(). So we need to calculate the vertical
526     // shift that the PostFilter constructor applied to |buffer| and reduce the
527     // bottom border by that amount.
528 #ifndef NDEBUG
529     const int vertical_shift = static_cast<int>(
530         (post_filter_.GetUnfilteredBuffer(plane) - buffer.data(plane)) /
531         buffer.stride(plane));
532     const int bottom_border = buffer.bottom_border(plane) - vertical_shift;
533     assert(bottom_border >= max_tx_length);
534 #endif
535     // In AV1, a transform block of height H starts at a y coordinate that is
536     // a multiple of H. If a transform block at the bottom of the frame has
537     // height H, then Reconstruct() will write up to the row with index
538     // Align(buffer.height(plane), H) - 1. Therefore the maximum number of
539     // rows Reconstruct() may write to is
540     // Align(buffer.height(plane), max_tx_length).
541     buffer_[plane].Reset(Align(buffer.height(plane), max_tx_length),
542                          buffer.stride(plane),
543                          post_filter_.GetUnfilteredBuffer(plane));
544     const int plane_height =
545         SubsampledValue(frame_header_.height, subsampling_y_[plane]);
546     deblock_row_limit_[plane] =
547         std::min(frame_header_.rows4x4, DivideBy4(plane_height + 3)
548                                             << subsampling_y_[plane]);
549     const int plane_width =
550         SubsampledValue(frame_header_.width, subsampling_x_[plane]);
551     deblock_column_limit_[plane] =
552         std::min(frame_header_.columns4x4, DivideBy4(plane_width + 3)
553                                                << subsampling_x_[plane]);
554   }
555 }
556 
Init()557 bool Tile::Init() {
558   assert(coefficient_levels_.size() == dc_categories_.size());
559   for (size_t i = 0; i < coefficient_levels_.size(); ++i) {
560     const int contexts_per_plane = (i == kEntropyContextLeft)
561                                        ? frame_header_.rows4x4
562                                        : frame_header_.columns4x4;
563     if (!coefficient_levels_[i].Reset(PlaneCount(), contexts_per_plane)) {
564       LIBGAV1_DLOG(ERROR, "coefficient_levels_[%zu].Reset() failed.", i);
565       return false;
566     }
567     if (!dc_categories_[i].Reset(PlaneCount(), contexts_per_plane)) {
568       LIBGAV1_DLOG(ERROR, "dc_categories_[%zu].Reset() failed.", i);
569       return false;
570     }
571   }
572   if (split_parse_and_decode_) {
573     assert(residual_buffer_pool_ != nullptr);
574     if (!residual_buffer_threaded_.Reset(superblock_rows_, superblock_columns_,
575                                          /*zero_initialize=*/false)) {
576       LIBGAV1_DLOG(ERROR, "residual_buffer_threaded_.Reset() failed.");
577       return false;
578     }
579   } else {
580     // Add 32 * |kResidualPaddingVertical| padding to avoid bottom boundary
581     // checks when parsing quantized coefficients.
582     residual_buffer_ = MakeAlignedUniquePtr<uint8_t>(
583         32, (4096 + 32 * kResidualPaddingVertical) * residual_size_);
584     if (residual_buffer_ == nullptr) {
585       LIBGAV1_DLOG(ERROR, "Allocation of residual_buffer_ failed.");
586       return false;
587     }
588     prediction_parameters_.reset(new (std::nothrow) PredictionParameters());
589     if (prediction_parameters_ == nullptr) {
590       LIBGAV1_DLOG(ERROR, "Allocation of prediction_parameters_ failed.");
591       return false;
592     }
593   }
594   if (frame_header_.use_ref_frame_mvs) {
595     assert(sequence_header_.enable_order_hint);
596     SetupMotionField(frame_header_, current_frame_, reference_frames_,
597                      row4x4_start_, row4x4_end_, column4x4_start_,
598                      column4x4_end_, &motion_field_);
599   }
600   ResetLoopRestorationParams();
601   return true;
602 }
603 
604 template <ProcessingMode processing_mode, bool save_symbol_decoder_context>
ProcessSuperBlockRow(int row4x4,TileScratchBuffer * const scratch_buffer)605 bool Tile::ProcessSuperBlockRow(int row4x4,
606                                 TileScratchBuffer* const scratch_buffer) {
607   if (row4x4 < row4x4_start_ || row4x4 >= row4x4_end_) return true;
608   assert(scratch_buffer != nullptr);
609   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
610   for (int column4x4 = column4x4_start_; column4x4 < column4x4_end_;
611        column4x4 += block_width4x4) {
612     if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer,
613                            processing_mode)) {
614       LIBGAV1_DLOG(ERROR, "Error decoding super block row: %d column: %d",
615                    row4x4, column4x4);
616       return false;
617     }
618   }
619   if (save_symbol_decoder_context && row4x4 + block_width4x4 >= row4x4_end_) {
620     SaveSymbolDecoderContext();
621   }
622   if (processing_mode == kProcessingModeDecodeOnly ||
623       processing_mode == kProcessingModeParseAndDecode) {
624     PopulateIntraPredictionBuffer(row4x4);
625   }
626   return true;
627 }
628 
629 // Used in frame parallel mode. The symbol decoder context need not be saved in
630 // this case since it was done when parsing was complete.
631 template bool Tile::ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
632     int row4x4, TileScratchBuffer* scratch_buffer);
633 // Used in non frame parallel mode.
634 template bool Tile::ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
635     int row4x4, TileScratchBuffer* scratch_buffer);
636 
SaveSymbolDecoderContext()637 void Tile::SaveSymbolDecoderContext() {
638   if (frame_header_.enable_frame_end_update_cdf &&
639       number_ == frame_header_.tile_info.context_update_id) {
640     *saved_symbol_decoder_context_ = symbol_decoder_context_;
641   }
642 }
643 
ParseAndDecode()644 bool Tile::ParseAndDecode() {
645   if (split_parse_and_decode_) {
646     if (!ThreadedParseAndDecode()) return false;
647     SaveSymbolDecoderContext();
648     return true;
649   }
650   std::unique_ptr<TileScratchBuffer> scratch_buffer =
651       tile_scratch_buffer_pool_->Get();
652   if (scratch_buffer == nullptr) {
653     pending_tiles_->Decrement(false);
654     LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
655     return false;
656   }
657   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
658   for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
659        row4x4 += block_width4x4) {
660     if (!ProcessSuperBlockRow<kProcessingModeParseAndDecode, true>(
661             row4x4, scratch_buffer.get())) {
662       pending_tiles_->Decrement(false);
663       return false;
664     }
665   }
666   tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
667   pending_tiles_->Decrement(true);
668   return true;
669 }
670 
Parse()671 bool Tile::Parse() {
672   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
673   std::unique_ptr<TileScratchBuffer> scratch_buffer =
674       tile_scratch_buffer_pool_->Get();
675   if (scratch_buffer == nullptr) {
676     LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
677     return false;
678   }
679   for (int row4x4 = row4x4_start_; row4x4 < row4x4_end_;
680        row4x4 += block_width4x4) {
681     if (!ProcessSuperBlockRow<kProcessingModeParseOnly, false>(
682             row4x4, scratch_buffer.get())) {
683       return false;
684     }
685   }
686   tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
687   SaveSymbolDecoderContext();
688   return true;
689 }
690 
Decode(std::mutex * const mutex,int * const superblock_row_progress,std::condition_variable * const superblock_row_progress_condvar)691 bool Tile::Decode(
692     std::mutex* const mutex, int* const superblock_row_progress,
693     std::condition_variable* const superblock_row_progress_condvar) {
694   const int block_width4x4 = sequence_header_.use_128x128_superblock ? 32 : 16;
695   const int block_width4x4_log2 =
696       sequence_header_.use_128x128_superblock ? 5 : 4;
697   std::unique_ptr<TileScratchBuffer> scratch_buffer =
698       tile_scratch_buffer_pool_->Get();
699   if (scratch_buffer == nullptr) {
700     LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
701     return false;
702   }
703   for (int row4x4 = row4x4_start_, index = row4x4_start_ >> block_width4x4_log2;
704        row4x4 < row4x4_end_; row4x4 += block_width4x4, ++index) {
705     if (!ProcessSuperBlockRow<kProcessingModeDecodeOnly, false>(
706             row4x4, scratch_buffer.get())) {
707       return false;
708     }
709     if (post_filter_.DoDeblock()) {
710       // Apply vertical deblock filtering for all the columns in this tile
711       // except for the first 64 columns.
712       post_filter_.ApplyDeblockFilter(
713           kLoopFilterTypeVertical, row4x4,
714           column4x4_start_ + kNum4x4InLoopFilterUnit, column4x4_end_,
715           block_width4x4);
716       // If this is the first superblock row of the tile, then we cannot apply
717       // horizontal deblocking here since we don't know if the top row is
718       // available. So it will be done by the calling thread in that case.
719       if (row4x4 != row4x4_start_) {
720         // Apply horizontal deblock filtering for all the columns in this tile
721         // except for the first and the last 64 columns.
722         // Note about the last tile of each row: For the last tile,
723         // column4x4_end may not be a multiple of 16. In that case it is still
724         // okay to simply subtract 16 since ApplyDeblockFilter() will only do
725         // the filters in increments of 64 columns (or 32 columns for chroma
726         // with subsampling).
727         post_filter_.ApplyDeblockFilter(
728             kLoopFilterTypeHorizontal, row4x4,
729             column4x4_start_ + kNum4x4InLoopFilterUnit,
730             column4x4_end_ - kNum4x4InLoopFilterUnit, block_width4x4);
731       }
732     }
733     bool notify;
734     {
735       std::unique_lock<std::mutex> lock(*mutex);
736       notify = ++superblock_row_progress[index] ==
737                frame_header_.tile_info.tile_columns;
738     }
739     if (notify) {
740       // We are done decoding this superblock row. Notify the post filtering
741       // thread.
742       superblock_row_progress_condvar[index].notify_one();
743     }
744   }
745   tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
746   return true;
747 }
748 
ThreadedParseAndDecode()749 bool Tile::ThreadedParseAndDecode() {
750   {
751     std::lock_guard<std::mutex> lock(threading_.mutex);
752     if (!threading_.sb_state.Reset(superblock_rows_, superblock_columns_)) {
753       pending_tiles_->Decrement(false);
754       LIBGAV1_DLOG(ERROR, "threading.sb_state.Reset() failed.");
755       return false;
756     }
757     // Account for the parsing job.
758     ++threading_.pending_jobs;
759   }
760 
761   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
762 
763   // Begin parsing.
764   std::unique_ptr<TileScratchBuffer> scratch_buffer =
765       tile_scratch_buffer_pool_->Get();
766   if (scratch_buffer == nullptr) {
767     pending_tiles_->Decrement(false);
768     LIBGAV1_DLOG(ERROR, "Failed to get scratch buffer.");
769     return false;
770   }
771   for (int row4x4 = row4x4_start_, row_index = 0; row4x4 < row4x4_end_;
772        row4x4 += block_width4x4, ++row_index) {
773     for (int column4x4 = column4x4_start_, column_index = 0;
774          column4x4 < column4x4_end_;
775          column4x4 += block_width4x4, ++column_index) {
776       if (!ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
777                              kProcessingModeParseOnly)) {
778         std::lock_guard<std::mutex> lock(threading_.mutex);
779         threading_.abort = true;
780         break;
781       }
782       std::unique_lock<std::mutex> lock(threading_.mutex);
783       if (threading_.abort) break;
784       threading_.sb_state[row_index][column_index] = kSuperBlockStateParsed;
785       // Schedule the decoding of this superblock if it is allowed.
786       if (CanDecode(row_index, column_index)) {
787         ++threading_.pending_jobs;
788         threading_.sb_state[row_index][column_index] =
789             kSuperBlockStateScheduled;
790         lock.unlock();
791         thread_pool_->Schedule(
792             [this, row_index, column_index, block_width4x4]() {
793               DecodeSuperBlock(row_index, column_index, block_width4x4);
794             });
795       }
796     }
797     std::lock_guard<std::mutex> lock(threading_.mutex);
798     if (threading_.abort) break;
799   }
800   tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
801 
802   // We are done parsing. We can return here since the calling thread will make
803   // sure that it waits for all the superblocks to be decoded.
804   //
805   // Finish using |threading_| before |pending_tiles_->Decrement()| because the
806   // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
807   // is called.
808   threading_.mutex.lock();
809   const bool no_pending_jobs = (--threading_.pending_jobs == 0);
810   const bool job_succeeded = !threading_.abort;
811   threading_.mutex.unlock();
812   if (no_pending_jobs) {
813     // We are done parsing and decoding this tile.
814     pending_tiles_->Decrement(job_succeeded);
815   }
816   return job_succeeded;
817 }
818 
CanDecode(int row_index,int column_index) const819 bool Tile::CanDecode(int row_index, int column_index) const {
820   assert(row_index >= 0);
821   assert(column_index >= 0);
822   // If |threading_.sb_state[row_index][column_index]| is not equal to
823   // kSuperBlockStateParsed, then return false. This is ok because if
824   // |threading_.sb_state[row_index][column_index]| is equal to:
825   //   kSuperBlockStateNone - then the superblock is not yet parsed.
826   //   kSuperBlockStateScheduled - then the superblock is already scheduled for
827   //                               decode.
828   //   kSuperBlockStateDecoded - then the superblock has already been decoded.
829   if (row_index >= superblock_rows_ || column_index >= superblock_columns_ ||
830       threading_.sb_state[row_index][column_index] != kSuperBlockStateParsed) {
831     return false;
832   }
833   // First superblock has no dependencies.
834   if (row_index == 0 && column_index == 0) {
835     return true;
836   }
837   // Superblocks in the first row only depend on the superblock to the left of
838   // it.
839   if (row_index == 0) {
840     return threading_.sb_state[0][column_index - 1] == kSuperBlockStateDecoded;
841   }
842   // All other superblocks depend on superblock to the left of it (if one
843   // exists) and superblock to the top right with a lag of
844   // |intra_block_copy_lag_| (if one exists).
845   const int top_right_column_index =
846       std::min(column_index + intra_block_copy_lag_, superblock_columns_ - 1);
847   return threading_.sb_state[row_index - 1][top_right_column_index] ==
848              kSuperBlockStateDecoded &&
849          (column_index == 0 ||
850           threading_.sb_state[row_index][column_index - 1] ==
851               kSuperBlockStateDecoded);
852 }
853 
// Decode job for the superblock at (|row_index|, |column_index|), where both
// indices are relative to the tile and |block_width4x4| is the superblock size
// in 4x4 units. Scheduled on |thread_pool_| by ThreadedParseAndDecode() and by
// this function itself.
//
// On success the superblock is marked as decoded and any neighbor that became
// decodable as a result is scheduled. On failure (no scratch buffer or
// ProcessSuperBlock() returned false) |threading_.abort| is set. In both cases
// the job is removed from |threading_.pending_jobs| and, if it was the last
// pending job, the tile's outcome is reported through |pending_tiles_|.
void Tile::DecodeSuperBlock(int row_index, int column_index,
                            int block_width4x4) {
  // Convert the tile-relative superblock indices to frame-level 4x4 positions.
  const int row4x4 = row4x4_start_ + (row_index * block_width4x4);
  const int column4x4 = column4x4_start_ + (column_index * block_width4x4);
  std::unique_ptr<TileScratchBuffer> scratch_buffer =
      tile_scratch_buffer_pool_->Get();
  bool ok = scratch_buffer != nullptr;
  if (ok) {
    // The decode itself runs without holding |threading_.mutex|.
    ok = ProcessSuperBlock(row4x4, column4x4, scratch_buffer.get(),
                           kProcessingModeDecodeOnly);
    tile_scratch_buffer_pool_->Release(std::move(scratch_buffer));
  }
  std::unique_lock<std::mutex> lock(threading_.mutex);
  if (ok) {
    threading_.sb_state[row_index][column_index] = kSuperBlockStateDecoded;
    // Candidate rows and columns that we could potentially begin the decoding
    // (if it is allowed to do so). The candidates are:
    //   1) The superblock to the bottom-left of the current superblock with a
    //   lag of |intra_block_copy_lag_| (or the beginning of the next superblock
    //   row in case there are less than |intra_block_copy_lag_| superblock
    //   columns in the Tile).
    //   2) The superblock to the right of the current superblock.
    const int candidate_row_indices[] = {row_index + 1, row_index};
    const int candidate_column_indices[] = {
        std::max(0, column_index - intra_block_copy_lag_), column_index + 1};
    for (size_t i = 0; i < std::extent<decltype(candidate_row_indices)>::value;
         ++i) {
      const int candidate_row_index = candidate_row_indices[i];
      const int candidate_column_index = candidate_column_indices[i];
      // CanDecode() also rejects candidates that fall outside of the tile.
      if (!CanDecode(candidate_row_index, candidate_column_index)) {
        continue;
      }
      // Count the new job and mark it as scheduled while still holding the
      // lock, so that no other job schedules the same superblock.
      ++threading_.pending_jobs;
      threading_.sb_state[candidate_row_index][candidate_column_index] =
          kSuperBlockStateScheduled;
      // Release the lock only for the duration of the Schedule() call.
      lock.unlock();
      thread_pool_->Schedule([this, candidate_row_index, candidate_column_index,
                              block_width4x4]() {
        DecodeSuperBlock(candidate_row_index, candidate_column_index,
                         block_width4x4);
      });
      lock.lock();
    }
  } else {
    // Signal the parsing job and the other decode jobs that the tile failed.
    threading_.abort = true;
  }
  // Finish using |threading_| before |pending_tiles_->Decrement()| because the
  // Tile object could go out of scope as soon as |pending_tiles_->Decrement()|
  // is called.
  const bool no_pending_jobs = (--threading_.pending_jobs == 0);
  const bool job_succeeded = !threading_.abort;
  lock.unlock();
  if (no_pending_jobs) {
    // We are done parsing and decoding this tile.
    pending_tiles_->Decrement(job_succeeded);
  }
}
911 
PopulateIntraPredictionBuffer(int row4x4)912 void Tile::PopulateIntraPredictionBuffer(int row4x4) {
913   const int block_width4x4 = kNum4x4BlocksWide[SuperBlockSize()];
914   if (!use_intra_prediction_buffer_ || row4x4 + block_width4x4 >= row4x4_end_) {
915     return;
916   }
917   const size_t pixel_size =
918       (sequence_header_.color_config.bitdepth == 8 ? sizeof(uint8_t)
919                                                    : sizeof(uint16_t));
920   for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
921     const int row_to_copy =
922         (MultiplyBy4(row4x4 + block_width4x4) >> subsampling_y_[plane]) - 1;
923     const size_t pixels_to_copy =
924         (MultiplyBy4(column4x4_end_ - column4x4_start_) >>
925          subsampling_x_[plane]) *
926         pixel_size;
927     const size_t column_start =
928         MultiplyBy4(column4x4_start_) >> subsampling_x_[plane];
929     void* start;
930 #if LIBGAV1_MAX_BITDEPTH >= 10
931     if (sequence_header_.color_config.bitdepth > 8) {
932       Array2DView<uint16_t> buffer(
933           buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
934           reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
935       start = &buffer[row_to_copy][column_start];
936     } else  // NOLINT
937 #endif
938     {
939       start = &buffer_[plane][row_to_copy][column_start];
940     }
941     memcpy((*intra_prediction_buffer_)[plane].get() + column_start * pixel_size,
942            start, pixels_to_copy);
943   }
944 }
945 
GetTransformAllZeroContext(const Block & block,Plane plane,TransformSize tx_size,int x4,int y4,int w4,int h4)946 int Tile::GetTransformAllZeroContext(const Block& block, Plane plane,
947                                      TransformSize tx_size, int x4, int y4,
948                                      int w4, int h4) {
949   const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
950   const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
951 
952   const int tx_width = kTransformWidth[tx_size];
953   const int tx_height = kTransformHeight[tx_size];
954   const BlockSize plane_size = block.residual_size[plane];
955   const int block_width = kBlockWidthPixels[plane_size];
956   const int block_height = kBlockHeightPixels[plane_size];
957 
958   int top = 0;
959   int left = 0;
960   const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
961   const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
962   if (plane == kPlaneY) {
963     if (block_width == tx_width && block_height == tx_height) return 0;
964     const uint8_t* coefficient_levels =
965         &coefficient_levels_[kEntropyContextTop][plane][x4];
966     for (int i = 0; i < num_top_elements; ++i) {
967       top = std::max(top, static_cast<int>(coefficient_levels[i]));
968     }
969     coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
970     for (int i = 0; i < num_left_elements; ++i) {
971       left = std::max(left, static_cast<int>(coefficient_levels[i]));
972     }
973     assert(top <= 4);
974     assert(left <= 4);
975     // kAllZeroContextsByTopLeft is pre-computed based on the logic in the spec
976     // for top and left.
977     return kAllZeroContextsByTopLeft[top][left];
978   }
979   const uint8_t* coefficient_levels =
980       &coefficient_levels_[kEntropyContextTop][plane][x4];
981   const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
982   for (int i = 0; i < num_top_elements; ++i) {
983     top |= coefficient_levels[i];
984     top |= dc_categories[i];
985   }
986   coefficient_levels = &coefficient_levels_[kEntropyContextLeft][plane][y4];
987   dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
988   for (int i = 0; i < num_left_elements; ++i) {
989     left |= coefficient_levels[i];
990     left |= dc_categories[i];
991   }
992   return static_cast<int>(top != 0) + static_cast<int>(left != 0) + 7 +
993          3 * static_cast<int>(block_width * block_height >
994                               tx_width * tx_height);
995 }
996 
GetTransformSet(TransformSize tx_size,bool is_inter) const997 TransformSet Tile::GetTransformSet(TransformSize tx_size, bool is_inter) const {
998   const TransformSize tx_size_square_min = kTransformSizeSquareMin[tx_size];
999   const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
1000   if (tx_size_square_max == kTransformSize64x64) return kTransformSetDctOnly;
1001   if (is_inter) {
1002     if (frame_header_.reduced_tx_set ||
1003         tx_size_square_max == kTransformSize32x32) {
1004       return kTransformSetInter3;
1005     }
1006     if (tx_size_square_min == kTransformSize16x16) return kTransformSetInter2;
1007     return kTransformSetInter1;
1008   }
1009   if (tx_size_square_max == kTransformSize32x32) return kTransformSetDctOnly;
1010   if (frame_header_.reduced_tx_set ||
1011       tx_size_square_min == kTransformSize16x16) {
1012     return kTransformSetIntra2;
1013   }
1014   return kTransformSetIntra1;
1015 }
1016 
ComputeTransformType(const Block & block,Plane plane,TransformSize tx_size,int block_x,int block_y)1017 TransformType Tile::ComputeTransformType(const Block& block, Plane plane,
1018                                          TransformSize tx_size, int block_x,
1019                                          int block_y) {
1020   const BlockParameters& bp = *block.bp;
1021   const TransformSize tx_size_square_max = kTransformSizeSquareMax[tx_size];
1022   if (frame_header_.segmentation.lossless[bp.segment_id] ||
1023       tx_size_square_max == kTransformSize64x64) {
1024     return kTransformTypeDctDct;
1025   }
1026   if (plane == kPlaneY) {
1027     return transform_types_[block_y - block.row4x4][block_x - block.column4x4];
1028   }
1029   const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
1030   TransformType tx_type;
1031   if (bp.is_inter) {
1032     const int x4 =
1033         std::max(block.column4x4, block_x << subsampling_x_[kPlaneU]);
1034     const int y4 = std::max(block.row4x4, block_y << subsampling_y_[kPlaneU]);
1035     tx_type = transform_types_[y4 - block.row4x4][x4 - block.column4x4];
1036   } else {
1037     tx_type = kModeToTransformType[bp.uv_mode];
1038   }
1039   return kTransformTypeInSetMask[tx_set].Contains(tx_type)
1040              ? tx_type
1041              : kTransformTypeDctDct;
1042 }
1043 
ReadTransformType(const Block & block,int x4,int y4,TransformSize tx_size)1044 void Tile::ReadTransformType(const Block& block, int x4, int y4,
1045                              TransformSize tx_size) {
1046   BlockParameters& bp = *block.bp;
1047   const TransformSet tx_set = GetTransformSet(tx_size, bp.is_inter);
1048 
1049   TransformType tx_type = kTransformTypeDctDct;
1050   if (tx_set != kTransformSetDctOnly &&
1051       frame_header_.segmentation.qindex[bp.segment_id] > 0) {
1052     const int cdf_index = SymbolDecoderContext::TxTypeIndex(tx_set);
1053     const int cdf_tx_size_index =
1054         TransformSizeToSquareTransformIndex(kTransformSizeSquareMin[tx_size]);
1055     uint16_t* cdf;
1056     if (bp.is_inter) {
1057       cdf = symbol_decoder_context_
1058                 .inter_tx_type_cdf[cdf_index][cdf_tx_size_index];
1059       switch (tx_set) {
1060         case kTransformSetInter1:
1061           tx_type = static_cast<TransformType>(reader_.ReadSymbol<16>(cdf));
1062           break;
1063         case kTransformSetInter2:
1064           tx_type = static_cast<TransformType>(reader_.ReadSymbol<12>(cdf));
1065           break;
1066         default:
1067           assert(tx_set == kTransformSetInter3);
1068           tx_type = static_cast<TransformType>(reader_.ReadSymbol(cdf));
1069           break;
1070       }
1071     } else {
1072       const PredictionMode intra_direction =
1073           block.bp->prediction_parameters->use_filter_intra
1074               ? kFilterIntraModeToIntraPredictor[block.bp->prediction_parameters
1075                                                      ->filter_intra_mode]
1076               : bp.y_mode;
1077       cdf =
1078           symbol_decoder_context_
1079               .intra_tx_type_cdf[cdf_index][cdf_tx_size_index][intra_direction];
1080       assert(tx_set == kTransformSetIntra1 || tx_set == kTransformSetIntra2);
1081       tx_type = static_cast<TransformType>((tx_set == kTransformSetIntra1)
1082                                                ? reader_.ReadSymbol<7>(cdf)
1083                                                : reader_.ReadSymbol<5>(cdf));
1084     }
1085 
1086     // This array does not contain an entry for kTransformSetDctOnly, so the
1087     // first dimension needs to be offset by 1.
1088     tx_type = kInverseTransformTypeBySet[tx_set - 1][tx_type];
1089   }
1090   SetTransformType(block, x4, y4, kTransformWidth4x4[tx_size],
1091                    kTransformHeight4x4[tx_size], tx_type, transform_types_);
1092 }
1093 
// Section 8.3.2 in the spec, under coeff_base and coeff_br.
// Bottom boundary checks are avoided by the padded rows.
// For a coefficient near the right boundary, the two right neighbors and the
// one bottom-right neighbor may be out of boundary. We don't check the right
// boundary for them, because the out of boundary neighbors project to positions
// above the diagonal line which goes through the current coefficient and these
// positions are still all 0s according to the diagonal scan order.
//
// Reads the base level (and, when it exceeds |kNumQuantizerBaseLevels|, the
// base range) of the coefficients at scan indices |eob| - 2 down to 0. Scan
// index 0 is handled separately after the loop because it uses fixed contexts.
// For every position the base level is stored in |level_buffer| and the final
// level in |quantized_buffer|, both indexed by the position taken from |scan|.
// The row stride of both buffers is 1 << |adjusted_tx_width_log2|.
template <typename ResidualType>
void Tile::ReadCoeffBase2D(
    const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
    int eob,
    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
                                 [kCoeffBaseRangeSymbolCount + 1],
    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
  const int tx_width = 1 << adjusted_tx_width_log2;
  for (int i = eob - 2; i >= 1; --i) {
    const uint16_t pos = scan[i];
    const int row = pos >> adjusted_tx_width_log2;
    const int column = pos & (tx_width - 1);
    auto* const quantized = &quantized_buffer[pos];
    auto* const levels = &level_buffer[pos];
    // Sum of the already decoded base levels at offsets {0, 1}, {1, 0},
    // {1, 1}, {0, 2} and {2, 0} from the current position, plus 1 for
    // rounding in the division below.
    const int neighbor_sum = 1 + levels[1] + levels[tx_width] +
                             levels[tx_width + 1] + levels[2] +
                             levels[MultiplyBy2(tx_width)];
    const int context =
        ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
        kCoeffBaseContextOffset[tx_size][std::min(row, 4)][std::min(column, 4)];
    int level =
        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
    levels[0] = level;
    if (level > kNumQuantizerBaseLevels) {
      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
      // + 1, because we clip the overall output to 6 and the unclipped
      // quantized values will always result in an output of greater than 6.
      int context = std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
                                          quantized[tx_width] +       // {1, 0}
                                          quantized[tx_width + 1]));  // {1, 1}
      // Adds 7 when both |row| and |column| are less than 2, 14 otherwise.
      context += 14 >> static_cast<int>((row | column) < 2);
      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
    }
    quantized[0] = level;
  }
  // Read position 0.
  {
    auto* const quantized = &quantized_buffer[0];
    // Position 0 always uses base context 0.
    int level = reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[0]);
    level_buffer[0] = level;
    if (level > kNumQuantizerBaseLevels) {
      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
      // + 1, because we clip the overall output to 6 and the unclipped
      // quantized values will always result in an output of greater than 6.
      const int context =
          std::min(6, DivideBy2(1 + quantized[1] +          // {0, 1}
                                quantized[tx_width] +       // {1, 0}
                                quantized[tx_width + 1]));  // {1, 1}
      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
    }
    quantized[0] = level;
  }
}
1155 
// Section 8.3.2 in the spec, under coeff_base and coeff_br.
// Bottom boundary checks are avoided by the padded rows.
// For a coefficient near the right boundary, the four right neighbors may be
// out of boundary. We don't do the boundary check for the first three right
// neighbors, because even for the transform blocks with smallest width 4, the
// first three out of boundary neighbors project to positions left of the
// current coefficient and these positions are still all 0s according to the
// column scan order. However, when transform block width is 4 and the current
// coefficient is on the right boundary, its fourth right neighbor projects to
// the under position on the same column, which could be nonzero. Therefore, we
// must skip the fourth right neighbor. To make it simple, for any coefficient,
// we always do the boundary check for its fourth right neighbor.
//
// Reads the base level (and, when it exceeds |kNumQuantizerBaseLevels|, the
// base range) of the coefficients at scan indices |eob| - 2 down to 0, storing
// the base level in |level_buffer| and the final level in |quantized_buffer|.
// The transform size parameter is unused; it is presumably kept so that the
// signature matches ReadCoeffBase2D().
// NOTE(review): the do/while loop reads scan[|eob| - 2] unconditionally, so
// this assumes |eob| >= 2 -- confirm against the caller.
template <typename ResidualType>
void Tile::ReadCoeffBaseHorizontal(
    const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
    int eob,
    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
                                 [kCoeffBaseRangeSymbolCount + 1],
    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
  const int tx_width = 1 << adjusted_tx_width_log2;
  int i = eob - 2;
  do {
    const uint16_t pos = scan[i];
    const int column = pos & (tx_width - 1);
    auto* const quantized = &quantized_buffer[pos];
    auto* const levels = &level_buffer[pos];
    // Sum of the already decoded base levels of the neighbors, plus 1 for
    // rounding in the division below. Only the fourth right neighbor is bounds
    // checked (see the comment above the function).
    const int neighbor_sum =
        1 + (levels[1] +                                  // {0, 1}
             levels[tx_width] +                           // {1, 0}
             levels[2] +                                  // {0, 2}
             levels[3] +                                  // {0, 3}
             ((column + 4 < tx_width) ? levels[4] : 0));  // {0, 4}
    const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
                        kCoeffBasePositionContextOffset[column];
    int level =
        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
    levels[0] = level;
    if (level > kNumQuantizerBaseLevels) {
      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
      // + 1, because we clip the overall output to 6 and the unclipped
      // quantized values will always result in an output of greater than 6.
      int context = std::min(6, DivideBy2(1 + quantized[1] +     // {0, 1}
                                          quantized[tx_width] +  // {1, 0}
                                          quantized[2]));        // {0, 2}
      // Position 0 gets no offset. Other positions get 7 when they are in
      // column 0 and 14 otherwise.
      if (pos != 0) {
        context += 14 >> static_cast<int>(column == 0);
      }
      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
    }
    quantized[0] = level;
  } while (--i >= 0);
}
1209 
// Section 8.3.2 in the spec, under coeff_base and coeff_br.
// Bottom boundary checks are avoided by the padded rows.
// Right boundary check is performed explicitly.
//
// Reads the base level (and, when it exceeds |kNumQuantizerBaseLevels|, the
// base range) of the coefficients at scan indices |eob| - 2 down to 0, storing
// the base level in |level_buffer| and the final level in |quantized_buffer|.
// The transform size parameter is unused; it is presumably kept so that the
// signature matches ReadCoeffBase2D().
// NOTE(review): the do/while loop reads scan[|eob| - 2] unconditionally, so
// this assumes |eob| >= 2 -- confirm against the caller.
template <typename ResidualType>
void Tile::ReadCoeffBaseVertical(
    const uint16_t* scan, TransformSize /*tx_size*/, int adjusted_tx_width_log2,
    int eob,
    uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
    uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
                                 [kCoeffBaseRangeSymbolCount + 1],
    ResidualType* const quantized_buffer, uint8_t* const level_buffer) {
  const int tx_width = 1 << adjusted_tx_width_log2;
  int i = eob - 2;
  do {
    const uint16_t pos = scan[i];
    const int row = pos >> adjusted_tx_width_log2;
    const int column = pos & (tx_width - 1);
    auto* const quantized = &quantized_buffer[pos];
    auto* const levels = &level_buffer[pos];
    // Sum of the already decoded base levels of the right neighbor (only when
    // it is inside the transform block) and of the four neighbors below, plus
    // 1 for rounding in the division below.
    const int neighbor_sum =
        1 + (((column + 1 < tx_width) ? levels[1] : 0) +  // {0, 1}
             levels[tx_width] +                           // {1, 0}
             levels[MultiplyBy2(tx_width)] +              // {2, 0}
             levels[tx_width * 3] +                       // {3, 0}
             levels[MultiplyBy4(tx_width)]);              // {4, 0}
    const int context = ((neighbor_sum > 7) ? 4 : DivideBy2(neighbor_sum)) +
                        kCoeffBasePositionContextOffset[row];
    int level =
        reader_.ReadSymbol<kCoeffBaseSymbolCount>(coeff_base_cdf[context]);
    levels[0] = level;
    if (level > kNumQuantizerBaseLevels) {
      // No need to clip quantized values to COEFF_BASE_RANGE + NUM_BASE_LEVELS
      // + 1, because we clip the overall output to 6 and the unclipped
      // quantized values will always result in an output of greater than 6.
      // The right neighbor counts as 0 when it is outside the transform block.
      const int quantized_column1 = (column + 1 < tx_width) ? quantized[1] : 0;
      int context =
          std::min(6, DivideBy2(1 + quantized_column1 +              // {0, 1}
                                quantized[tx_width] +                // {1, 0}
                                quantized[MultiplyBy2(tx_width)]));  // {2, 0}
      // Position 0 gets no offset. Other positions get 7 when they are in
      // row 0 and 14 otherwise.
      if (pos != 0) {
        context += 14 >> static_cast<int>(row == 0);
      }
      level += ReadCoeffBaseRange(coeff_base_range_cdf[context]);
    }
    quantized[0] = level;
  } while (--i >= 0);
}
1257 
GetDcSignContext(int x4,int y4,int w4,int h4,Plane plane)1258 int Tile::GetDcSignContext(int x4, int y4, int w4, int h4, Plane plane) {
1259   const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
1260   const int8_t* dc_categories = &dc_categories_[kEntropyContextTop][plane][x4];
1261   // Set dc_sign to 8-bit long so that std::accumulate() saves sign extension.
1262   int8_t dc_sign = std::accumulate(
1263       dc_categories, dc_categories + GetNumElements(w4, x4, max_x4x4), 0);
1264   const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
1265   dc_categories = &dc_categories_[kEntropyContextLeft][plane][y4];
1266   dc_sign = std::accumulate(
1267       dc_categories, dc_categories + GetNumElements(h4, y4, max_y4x4), dc_sign);
1268   // This return statement is equivalent to:
1269   //   if (dc_sign < 0) return 1;
1270   //   if (dc_sign > 0) return 2;
1271   //   return 0;
1272   // And it is better than:
1273   //   return static_cast<int>(dc_sign != 0) + static_cast<int>(dc_sign > 0);
1274   return static_cast<int>(dc_sign < 0) +
1275          MultiplyBy2(static_cast<int>(dc_sign > 0));
1276 }
1277 
SetEntropyContexts(int x4,int y4,int w4,int h4,Plane plane,uint8_t coefficient_level,int8_t dc_category)1278 void Tile::SetEntropyContexts(int x4, int y4, int w4, int h4, Plane plane,
1279                               uint8_t coefficient_level, int8_t dc_category) {
1280   const int max_x4x4 = frame_header_.columns4x4 >> subsampling_x_[plane];
1281   const int num_top_elements = GetNumElements(w4, x4, max_x4x4);
1282   memset(&coefficient_levels_[kEntropyContextTop][plane][x4], coefficient_level,
1283          num_top_elements);
1284   memset(&dc_categories_[kEntropyContextTop][plane][x4], dc_category,
1285          num_top_elements);
1286   const int max_y4x4 = frame_header_.rows4x4 >> subsampling_y_[plane];
1287   const int num_left_elements = GetNumElements(h4, y4, max_y4x4);
1288   memset(&coefficient_levels_[kEntropyContextLeft][plane][y4],
1289          coefficient_level, num_left_elements);
1290   memset(&dc_categories_[kEntropyContextLeft][plane][y4], dc_category,
1291          num_left_elements);
1292 }
1293 
1294 template <typename ResidualType, bool is_dc_coefficient>
ReadSignAndApplyDequantization(const uint16_t * const scan,int i,int q_value,const uint8_t * const quantizer_matrix,int shift,int max_value,uint16_t * const dc_sign_cdf,int8_t * const dc_category,int * const coefficient_level,ResidualType * residual_buffer)1295 bool Tile::ReadSignAndApplyDequantization(
1296     const uint16_t* const scan, int i, int q_value,
1297     const uint8_t* const quantizer_matrix, int shift, int max_value,
1298     uint16_t* const dc_sign_cdf, int8_t* const dc_category,
1299     int* const coefficient_level, ResidualType* residual_buffer) {
1300   const int pos = is_dc_coefficient ? 0 : scan[i];
1301   // If residual_buffer[pos] is zero, then the rest of the function has no
1302   // effect.
1303   int level = residual_buffer[pos];
1304   if (level == 0) return true;
1305   const int sign = is_dc_coefficient
1306                        ? static_cast<int>(reader_.ReadSymbol(dc_sign_cdf))
1307                        : reader_.ReadBit();
1308   if (level > kNumQuantizerBaseLevels + kQuantizerCoefficientBaseRange) {
1309     int length = 0;
1310     bool golomb_length_bit = false;
1311     do {
1312       golomb_length_bit = static_cast<bool>(reader_.ReadBit());
1313       ++length;
1314       if (length > 20) {
1315         LIBGAV1_DLOG(ERROR, "Invalid golomb_length %d", length);
1316         return false;
1317       }
1318     } while (!golomb_length_bit);
1319     int x = 1;
1320     for (int i = length - 2; i >= 0; --i) {
1321       x = (x << 1) | reader_.ReadBit();
1322     }
1323     level += x - 1;
1324   }
1325   if (is_dc_coefficient) {
1326     *dc_category = (sign != 0) ? -1 : 1;
1327   }
1328   level &= 0xfffff;
1329   *coefficient_level += level;
1330   // Apply dequantization. Step 1 of section 7.12.3 in the spec.
1331   int q = q_value;
1332   if (quantizer_matrix != nullptr) {
1333     q = RightShiftWithRounding(q * quantizer_matrix[pos], 5);
1334   }
1335   // The intermediate multiplication can exceed 32 bits, so it has to be
1336   // performed by promoting one of the values to int64_t.
1337   int32_t dequantized_value = (static_cast<int64_t>(q) * level) & 0xffffff;
1338   dequantized_value >>= shift;
1339   // At this point:
1340   //   * |dequantized_value| is always non-negative.
1341   //   * |sign| can be either 0 or 1.
1342   //   * min_value = -(max_value + 1).
1343   // We need to apply the following:
1344   // dequantized_value = sign ? -dequantized_value : dequantized_value;
1345   // dequantized_value = Clip3(dequantized_value, min_value, max_value);
1346   //
1347   // Note that -x == ~(x - 1).
1348   //
1349   // Now, The above two lines can be done with a std::min and xor as follows:
1350   dequantized_value = std::min(dequantized_value - sign, max_value) ^ -sign;
1351   residual_buffer[pos] = dequantized_value;
1352   return true;
1353 }
1354 
ReadCoeffBaseRange(uint16_t * cdf)1355 int Tile::ReadCoeffBaseRange(uint16_t* cdf) {
1356   int level = 0;
1357   for (int j = 0; j < kCoeffBaseRangeMaxIterations; ++j) {
1358     const int coeff_base_range =
1359         reader_.ReadSymbol<kCoeffBaseRangeSymbolCount>(cdf);
1360     level += coeff_base_range;
1361     if (coeff_base_range < (kCoeffBaseRangeSymbolCount - 1)) break;
1362   }
1363   return level;
1364 }
1365 
// Parses the coefficients of the transform block of |plane| that starts at
// pixel position (|start_x|, |start_y|) and has size |tx_size|, and stores the
// dequantized values in the residual buffer pointed to by |*block.residual|
// (interpreted as an array of ResidualType).
//
// |*tx_type| receives the transform type computed for the block. If the block
// is coded as all zero, it is set to the placeholder kNumTransformTypes.
//
// The top/left entropy contexts are updated in every successful case. When
// |split_parse_and_decode_| is true, |*block.residual| is advanced past this
// transform block on success (but not in the all-zero case).
//
// Returns 0 if the block is all zero, -1 if reading a coefficient sign/golomb
// value fails, and the end-of-block position (eob, at least 1) otherwise.
template <typename ResidualType>
int Tile::ReadTransformCoefficients(const Block& block, Plane plane,
                                    int start_x, int start_y,
                                    TransformSize tx_size,
                                    TransformType* const tx_type) {
  // Position and size of the transform block in 4x4 units.
  const int x4 = DivideBy4(start_x);
  const int y4 = DivideBy4(start_y);
  const int w4 = kTransformWidth4x4[tx_size];
  const int h4 = kTransformHeight4x4[tx_size];
  const int tx_size_context = kTransformSizeContext[tx_size];
  // |context| is reused below for each symbol that needs a context index.
  int context =
      GetTransformAllZeroContext(block, plane, tx_size, x4, y4, w4, h4);
  const bool all_zero = reader_.ReadSymbol(
      symbol_decoder_context_.all_zero_cdf[tx_size_context][context]);
  if (all_zero) {
    // No coefficients are coded: record the default transform type for luma
    // and clear the entropy contexts covered by this block.
    if (plane == kPlaneY) {
      SetTransformType(block, x4, y4, w4, h4, kTransformTypeDctDct,
                       transform_types_);
    }
    SetEntropyContexts(x4, y4, w4, h4, plane, 0, 0);
    // This is not used in this case, so it can be set to any value.
    *tx_type = kNumTransformTypes;
    return 0;
  }
  const int tx_width = kTransformWidth[tx_size];
  const int tx_height = kTransformHeight[tx_size];
  const TransformSize adjusted_tx_size = kAdjustedTransformSize[tx_size];
  const int adjusted_tx_width_log2 = kTransformWidthLog2[adjusted_tx_size];
  // Number of padding entries: kResidualPaddingVertical rows of the adjusted
  // transform width.
  const int tx_padding =
      (1 << adjusted_tx_width_log2) * kResidualPaddingVertical;
  auto* residual = reinterpret_cast<ResidualType*>(*block.residual);
  // Clear padding to avoid bottom boundary checks when parsing quantized
  // coefficients.
  memset(residual, 0, (tx_width * tx_height + tx_padding) * residual_size_);
  // Scratch buffer passed to the ReadCoeffBase*() functions as their
  // |level_buffer|. It is sized for a 32x32 block plus the padding rows, and
  // only the part covered by the adjusted transform size (plus padding) is
  // cleared.
  uint8_t level_buffer[(32 + kResidualPaddingVertical) * 32];
  memset(
      level_buffer, 0,
      kTransformWidth[adjusted_tx_size] * kTransformHeight[adjusted_tx_size] +
          tx_padding);
  const int clamped_tx_height = std::min(tx_height, 32);
  if (plane == kPlaneY) {
    ReadTransformType(block, x4, y4, tx_size);
  }
  BlockParameters& bp = *block.bp;
  *tx_type = ComputeTransformType(block, plane, tx_size, x4, y4);
  const int eob_multi_size = kEobMultiSizeLookup[tx_size];
  const PlaneType plane_type = GetPlaneType(plane);
  const TransformClass tx_class = GetTransformClass(*tx_type);
  // The eob_pt cdfs for the smaller sizes are indexed by whether the transform
  // class is 2D (0) or not (1).
  context = static_cast<int>(tx_class != kTransformClass2D);
  // eob_pt is 1 plus the symbol read with the cdf selected by |eob_multi_size|.
  int eob_pt = 1;
  switch (eob_multi_size) {
    case 0:
      eob_pt += reader_.ReadSymbol<kEobPt16SymbolCount>(
          symbol_decoder_context_.eob_pt_16_cdf[plane_type][context]);
      break;
    case 1:
      eob_pt += reader_.ReadSymbol<kEobPt32SymbolCount>(
          symbol_decoder_context_.eob_pt_32_cdf[plane_type][context]);
      break;
    case 2:
      eob_pt += reader_.ReadSymbol<kEobPt64SymbolCount>(
          symbol_decoder_context_.eob_pt_64_cdf[plane_type][context]);
      break;
    case 3:
      eob_pt += reader_.ReadSymbol<kEobPt128SymbolCount>(
          symbol_decoder_context_.eob_pt_128_cdf[plane_type][context]);
      break;
    case 4:
      eob_pt += reader_.ReadSymbol<kEobPt256SymbolCount>(
          symbol_decoder_context_.eob_pt_256_cdf[plane_type][context]);
      break;
    case 5:
      eob_pt += reader_.ReadSymbol<kEobPt512SymbolCount>(
          symbol_decoder_context_.eob_pt_512_cdf[plane_type]);
      break;
    case 6:
    default:
      eob_pt += reader_.ReadSymbol<kEobPt1024SymbolCount>(
          symbol_decoder_context_.eob_pt_1024_cdf[plane_type]);
      break;
  }
  // Base value of eob for this eob_pt; for eob_pt >= 3 it is refined by the
  // extra bits read below.
  int eob = (eob_pt < 2) ? eob_pt : ((1 << (eob_pt - 2)) + 1);
  if (eob_pt >= 3) {
    context = eob_pt - 3;
    // The most significant extra bit (bit eob_pt - 3) is coded with a cdf.
    const bool eob_extra = reader_.ReadSymbol(
        symbol_decoder_context_
            .eob_extra_cdf[tx_size_context][plane_type][context]);
    if (eob_extra) eob += 1 << (eob_pt - 3);
    // The remaining extra bits are read as raw bits, from bit eob_pt - 4 down
    // to bit 0.
    for (int i = 1; i < eob_pt - 2; ++i) {
      assert(eob_pt - i >= 3);
      assert(eob_pt <= kEobPt1024SymbolCount);
      if (static_cast<bool>(reader_.ReadBit())) {
        eob += 1 << (eob_pt - i - 3);
      }
    }
  }
  const uint16_t* scan = kScan[tx_class][tx_size];
  // The coeff_base_range cdfs are indexed by the transform size context capped
  // at 3.
  const int clamped_tx_size_context = std::min(tx_size_context, 3);
  auto coeff_base_range_cdf =
      symbol_decoder_context_
          .coeff_base_range_cdf[clamped_tx_size_context][plane_type];
  // Read the last coefficient.
  {
    context = GetCoeffBaseContextEob(tx_size, eob - 1);
    const uint16_t pos = scan[eob - 1];
    // The last coefficient is non-zero by construction, hence the "1 +".
    int level =
        1 + reader_.ReadSymbol<kCoeffBaseEobSymbolCount>(
                symbol_decoder_context_
                    .coeff_base_eob_cdf[tx_size_context][plane_type][context]);
    // |level_buffer| keeps the base level only; |residual| gets base + range.
    level_buffer[pos] = level;
    if (level > kNumQuantizerBaseLevels) {
      level +=
          ReadCoeffBaseRange(coeff_base_range_cdf[GetCoeffBaseRangeContextEob(
              adjusted_tx_width_log2, pos, tx_class)]);
    }
    residual[pos] = level;
  }
  if (eob > 1) {
    // Read all the other coefficients.
    // Lookup used to call the right variant of ReadCoeffBase*() based on the
    // transform class.
    static constexpr void (Tile::*kGetCoeffBaseFunc[])(
        const uint16_t* scan, TransformSize tx_size, int adjusted_tx_width_log2,
        int eob,
        uint16_t coeff_base_cdf[kCoeffBaseContexts][kCoeffBaseSymbolCount + 1],
        uint16_t coeff_base_range_cdf[kCoeffBaseRangeContexts]
                                     [kCoeffBaseRangeSymbolCount + 1],
        ResidualType* quantized_buffer,
        uint8_t* level_buffer) = {&Tile::ReadCoeffBase2D<ResidualType>,
                                  &Tile::ReadCoeffBaseHorizontal<ResidualType>,
                                  &Tile::ReadCoeffBaseVertical<ResidualType>};
    (this->*kGetCoeffBaseFunc[tx_class])(
        scan, tx_size, adjusted_tx_width_log2, eob,
        symbol_decoder_context_.coeff_base_cdf[tx_size_context][plane_type],
        coeff_base_range_cdf, residual, level_buffer);
  }
  // Largest dequantized value allowed; the smallest is -(max_value + 1) (see
  // ReadSignAndApplyDequantization()).
  const int max_value = (1 << (7 + sequence_header_.color_config.bitdepth)) - 1;
  const int current_quantizer_index = GetQIndex(
      frame_header_.segmentation, bp.segment_id, current_quantizer_index_);
  const int dc_q_value = quantizer_.GetDcValue(plane, current_quantizer_index);
  const int ac_q_value = quantizer_.GetAcValue(plane, current_quantizer_index);
  const int shift = kQuantizationShift[tx_size];
  // A quantizer matrix is applied only if matrices are enabled, the transform
  // type is below kTransformTypeIdentityIdentity, the segment is not lossless
  // and the matrix level for the plane is below 15. Otherwise it is nullptr
  // and the plain quantizer value is used.
  const uint8_t* const quantizer_matrix =
      (frame_header_.quantizer.use_matrix &&
       *tx_type < kTransformTypeIdentityIdentity &&
       !frame_header_.segmentation.lossless[bp.segment_id] &&
       frame_header_.quantizer.matrix_level[plane] < 15)
          ? quantizer_matrix_[frame_header_.quantizer.matrix_level[plane]]
                             [plane_type][adjusted_tx_size]
                                 .get()
          : nullptr;
  // Running sum of the coefficient levels and the dc category; both are
  // written to the entropy contexts at the end.
  int coefficient_level = 0;
  int8_t dc_category = 0;
  // The dc sign cdf is only looked up when the DC coefficient is non-zero.
  // ReadSignAndApplyDequantization() returns before reading the sign when the
  // coefficient is zero, so the nullptr is never dereferenced.
  uint16_t* const dc_sign_cdf =
      (residual[0] != 0)
          ? symbol_decoder_context_.dc_sign_cdf[plane_type][GetDcSignContext(
                x4, y4, w4, h4, plane)]
          : nullptr;
  assert(scan[0] == 0);
  // DC coefficient (scan index 0, position 0).
  if (!ReadSignAndApplyDequantization<ResidualType, /*is_dc_coefficient=*/true>(
          scan, 0, dc_q_value, quantizer_matrix, shift, max_value, dc_sign_cdf,
          &dc_category, &coefficient_level, residual)) {
    return -1;
  }
  if (eob > 1) {
    // AC coefficients, in increasing scan order.
    int i = 1;
    do {
      if (!ReadSignAndApplyDequantization<ResidualType,
                                          /*is_dc_coefficient=*/false>(
              scan, i, ac_q_value, quantizer_matrix, shift, max_value, nullptr,
              nullptr, &coefficient_level, residual)) {
        return -1;
      }
    } while (++i < eob);
    // NOTE(review): presumably rearranges the coefficients for 64-wide
    // transforms, which are parsed with a narrower adjusted size -- confirm
    // against the helper's definition.
    MoveCoefficientsForTxWidth64(clamped_tx_height, tx_width, residual);
  }
  // The coefficient level stored in the entropy context is capped at 4.
  SetEntropyContexts(x4, y4, w4, h4, plane, std::min(4, coefficient_level),
                     dc_category);
  if (split_parse_and_decode_) {
    // Advance to the residual storage of the next transform block.
    *block.residual += tx_width * tx_height * residual_size_;
  }
  return eob;
}
1549 
// CALL_BITDEPTH_FUNCTION invokes the template |function| instantiated with the
// pixel type that matches |sequence_header_.color_config.bitdepth| (uint8_t
// for a bitdepth of at most 8, uint16_t otherwise) and forwards the variadic
// arguments to it. When high bitdepth support is not compiled in, the uint8_t
// instantiation is always used.
#if LIBGAV1_MAX_BITDEPTH < 10
#define CALL_BITDEPTH_FUNCTION(function, ...) \
  do {                                        \
    function<uint8_t>(__VA_ARGS__);           \
  } while (false)
#else
#define CALL_BITDEPTH_FUNCTION(function, ...)          \
  do {                                                 \
    if (sequence_header_.color_config.bitdepth <= 8) { \
      function<uint8_t>(__VA_ARGS__);                  \
    } else {                                           \
      function<uint16_t>(__VA_ARGS__);                 \
    }                                                  \
  } while (false)
#endif
1568 
TransformBlock(const Block & block,Plane plane,int base_x,int base_y,TransformSize tx_size,int x,int y,ProcessingMode mode)1569 bool Tile::TransformBlock(const Block& block, Plane plane, int base_x,
1570                           int base_y, TransformSize tx_size, int x, int y,
1571                           ProcessingMode mode) {
1572   BlockParameters& bp = *block.bp;
1573   const int subsampling_x = subsampling_x_[plane];
1574   const int subsampling_y = subsampling_y_[plane];
1575   const int start_x = base_x + MultiplyBy4(x);
1576   const int start_y = base_y + MultiplyBy4(y);
1577   const int max_x = MultiplyBy4(frame_header_.columns4x4) >> subsampling_x;
1578   const int max_y = MultiplyBy4(frame_header_.rows4x4) >> subsampling_y;
1579   if (start_x >= max_x || start_y >= max_y) return true;
1580   const int row = DivideBy4(start_y << subsampling_y);
1581   const int column = DivideBy4(start_x << subsampling_x);
1582   const int mask = sequence_header_.use_128x128_superblock ? 31 : 15;
1583   const int sub_block_row4x4 = row & mask;
1584   const int sub_block_column4x4 = column & mask;
1585   const int step_x = kTransformWidth4x4[tx_size];
1586   const int step_y = kTransformHeight4x4[tx_size];
1587   const bool do_decode = mode == kProcessingModeDecodeOnly ||
1588                          mode == kProcessingModeParseAndDecode;
1589   if (do_decode && !bp.is_inter) {
1590     if (bp.palette_mode_info.size[GetPlaneType(plane)] > 0) {
1591       CALL_BITDEPTH_FUNCTION(PalettePrediction, block, plane, start_x, start_y,
1592                              x, y, tx_size);
1593     } else {
1594       const PredictionMode mode =
1595           (plane == kPlaneY)
1596               ? bp.y_mode
1597               : (bp.uv_mode == kPredictionModeChromaFromLuma ? kPredictionModeDc
1598                                                              : bp.uv_mode);
1599       const int tr_row4x4 = (sub_block_row4x4 >> subsampling_y);
1600       const int tr_column4x4 =
1601           (sub_block_column4x4 >> subsampling_x) + step_x + 1;
1602       const int bl_row4x4 = (sub_block_row4x4 >> subsampling_y) + step_y + 1;
1603       const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x);
1604       const bool has_left = x > 0 || block.left_available[plane];
1605       const bool has_top = y > 0 || block.top_available[plane];
1606 
1607       CALL_BITDEPTH_FUNCTION(
1608           IntraPrediction, block, plane, start_x, start_y, has_left, has_top,
1609           block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
1610           block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
1611           mode, tx_size);
1612       if (plane != kPlaneY && bp.uv_mode == kPredictionModeChromaFromLuma) {
1613         CALL_BITDEPTH_FUNCTION(ChromaFromLumaPrediction, block, plane, start_x,
1614                                start_y, tx_size);
1615       }
1616     }
1617     if (plane == kPlaneY) {
1618       block.bp->prediction_parameters->max_luma_width =
1619           start_x + MultiplyBy4(step_x);
1620       block.bp->prediction_parameters->max_luma_height =
1621           start_y + MultiplyBy4(step_y);
1622       block.scratch_buffer->cfl_luma_buffer_valid = false;
1623     }
1624   }
1625   if (!bp.skip) {
1626     const int sb_row_index = SuperBlockRowIndex(block.row4x4);
1627     const int sb_column_index = SuperBlockColumnIndex(block.column4x4);
1628     if (mode == kProcessingModeDecodeOnly) {
1629       Queue<TransformParameters>& tx_params =
1630           *residual_buffer_threaded_[sb_row_index][sb_column_index]
1631                ->transform_parameters();
1632       ReconstructBlock(block, plane, start_x, start_y, tx_size,
1633                        tx_params.Front().type,
1634                        tx_params.Front().non_zero_coeff_count);
1635       tx_params.Pop();
1636     } else {
1637       TransformType tx_type;
1638       int non_zero_coeff_count;
1639 #if LIBGAV1_MAX_BITDEPTH >= 10
1640       if (sequence_header_.color_config.bitdepth > 8) {
1641         non_zero_coeff_count = ReadTransformCoefficients<int32_t>(
1642             block, plane, start_x, start_y, tx_size, &tx_type);
1643       } else  // NOLINT
1644 #endif
1645       {
1646         non_zero_coeff_count = ReadTransformCoefficients<int16_t>(
1647             block, plane, start_x, start_y, tx_size, &tx_type);
1648       }
1649       if (non_zero_coeff_count < 0) return false;
1650       if (mode == kProcessingModeParseAndDecode) {
1651         ReconstructBlock(block, plane, start_x, start_y, tx_size, tx_type,
1652                          non_zero_coeff_count);
1653       } else {
1654         assert(mode == kProcessingModeParseOnly);
1655         residual_buffer_threaded_[sb_row_index][sb_column_index]
1656             ->transform_parameters()
1657             ->Push(TransformParameters(tx_type, non_zero_coeff_count));
1658       }
1659     }
1660   }
1661   if (do_decode) {
1662     bool* block_decoded =
1663         &block.scratch_buffer
1664              ->block_decoded[plane][(sub_block_row4x4 >> subsampling_y) + 1]
1665                             [(sub_block_column4x4 >> subsampling_x) + 1];
1666     SetBlockValues<bool>(step_y, step_x, true, block_decoded,
1667                          TileScratchBuffer::kBlockDecodedStride);
1668   }
1669   return true;
1670 }
1671 
TransformTree(const Block & block,int start_x,int start_y,BlockSize plane_size,ProcessingMode mode)1672 bool Tile::TransformTree(const Block& block, int start_x, int start_y,
1673                          BlockSize plane_size, ProcessingMode mode) {
1674   assert(plane_size <= kBlock64x64);
1675   // Branching factor is 4; Maximum Depth is 4; So the maximum stack size
1676   // required is (4 - 1) * 4 + 1 = 13.
1677   Stack<TransformTreeNode, 13> stack;
1678   // It is okay to cast BlockSize to TransformSize here since the enum are
1679   // equivalent for all BlockSize values <= kBlock64x64.
1680   stack.Push(TransformTreeNode(start_x, start_y,
1681                                static_cast<TransformSize>(plane_size)));
1682 
1683   do {
1684     TransformTreeNode node = stack.Pop();
1685     const int row = DivideBy4(node.y);
1686     const int column = DivideBy4(node.x);
1687     if (row >= frame_header_.rows4x4 || column >= frame_header_.columns4x4) {
1688       continue;
1689     }
1690     const TransformSize inter_tx_size = inter_transform_sizes_[row][column];
1691     const int width = kTransformWidth[node.tx_size];
1692     const int height = kTransformHeight[node.tx_size];
1693     if (width <= kTransformWidth[inter_tx_size] &&
1694         height <= kTransformHeight[inter_tx_size]) {
1695       if (!TransformBlock(block, kPlaneY, node.x, node.y, node.tx_size, 0, 0,
1696                           mode)) {
1697         return false;
1698       }
1699       continue;
1700     }
1701     // The split transform size look up gives the right transform size that we
1702     // should push in the stack.
1703     //   if (width > height) => transform size whose width is half.
1704     //   if (width < height) => transform size whose height is half.
1705     //   if (width == height) => transform size whose width and height are half.
1706     const TransformSize split_tx_size = kSplitTransformSize[node.tx_size];
1707     const int half_width = DivideBy2(width);
1708     if (width > height) {
1709       stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
1710       stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
1711       continue;
1712     }
1713     const int half_height = DivideBy2(height);
1714     if (width < height) {
1715       stack.Push(
1716           TransformTreeNode(node.x, node.y + half_height, split_tx_size));
1717       stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
1718       continue;
1719     }
1720     stack.Push(TransformTreeNode(node.x + half_width, node.y + half_height,
1721                                  split_tx_size));
1722     stack.Push(TransformTreeNode(node.x, node.y + half_height, split_tx_size));
1723     stack.Push(TransformTreeNode(node.x + half_width, node.y, split_tx_size));
1724     stack.Push(TransformTreeNode(node.x, node.y, split_tx_size));
1725   } while (!stack.Empty());
1726   return true;
1727 }
1728 
ReconstructBlock(const Block & block,Plane plane,int start_x,int start_y,TransformSize tx_size,TransformType tx_type,int non_zero_coeff_count)1729 void Tile::ReconstructBlock(const Block& block, Plane plane, int start_x,
1730                             int start_y, TransformSize tx_size,
1731                             TransformType tx_type, int non_zero_coeff_count) {
1732   // Reconstruction process. Steps 2 and 3 of Section 7.12.3 in the spec.
1733   assert(non_zero_coeff_count >= 0);
1734   if (non_zero_coeff_count == 0) return;
1735 #if LIBGAV1_MAX_BITDEPTH >= 10
1736   if (sequence_header_.color_config.bitdepth > 8) {
1737     Array2DView<uint16_t> buffer(
1738         buffer_[plane].rows(), buffer_[plane].columns() / sizeof(uint16_t),
1739         reinterpret_cast<uint16_t*>(&buffer_[plane][0][0]));
1740     Reconstruct(dsp_, tx_type, tx_size,
1741                 frame_header_.segmentation.lossless[block.bp->segment_id],
1742                 reinterpret_cast<int32_t*>(*block.residual), start_x, start_y,
1743                 &buffer, non_zero_coeff_count);
1744   } else  // NOLINT
1745 #endif
1746   {
1747     Reconstruct(dsp_, tx_type, tx_size,
1748                 frame_header_.segmentation.lossless[block.bp->segment_id],
1749                 reinterpret_cast<int16_t*>(*block.residual), start_x, start_y,
1750                 &buffer_[plane], non_zero_coeff_count);
1751   }
1752   if (split_parse_and_decode_) {
1753     *block.residual +=
1754         kTransformWidth[tx_size] * kTransformHeight[tx_size] * residual_size_;
1755   }
1756 }
1757 
Residual(const Block & block,ProcessingMode mode)1758 bool Tile::Residual(const Block& block, ProcessingMode mode) {
1759   const int width_chunks = std::max(1, block.width >> 6);
1760   const int height_chunks = std::max(1, block.height >> 6);
1761   const BlockSize size_chunk4x4 =
1762       (width_chunks > 1 || height_chunks > 1) ? kBlock64x64 : block.size;
1763   const BlockParameters& bp = *block.bp;
1764   for (int chunk_y = 0; chunk_y < height_chunks; ++chunk_y) {
1765     for (int chunk_x = 0; chunk_x < width_chunks; ++chunk_x) {
1766       const int num_planes = block.HasChroma() ? PlaneCount() : 1;
1767       int plane = kPlaneY;
1768       do {
1769         const int subsampling_x = subsampling_x_[plane];
1770         const int subsampling_y = subsampling_y_[plane];
1771         // For Y Plane, when lossless is true |bp.transform_size| is always
1772         // kTransformSize4x4. So we can simply use |bp.transform_size| here as
1773         // the Y plane's transform size (part of Section 5.11.37 in the spec).
1774         const TransformSize tx_size =
1775             (plane == kPlaneY) ? bp.transform_size : bp.uv_transform_size;
1776         const BlockSize plane_size =
1777             kPlaneResidualSize[size_chunk4x4][subsampling_x][subsampling_y];
1778         assert(plane_size != kBlockInvalid);
1779         if (bp.is_inter &&
1780             !frame_header_.segmentation.lossless[bp.segment_id] &&
1781             plane == kPlaneY) {
1782           const int row_chunk4x4 = block.row4x4 + MultiplyBy16(chunk_y);
1783           const int column_chunk4x4 = block.column4x4 + MultiplyBy16(chunk_x);
1784           const int base_x = MultiplyBy4(column_chunk4x4 >> subsampling_x);
1785           const int base_y = MultiplyBy4(row_chunk4x4 >> subsampling_y);
1786           if (!TransformTree(block, base_x, base_y, plane_size, mode)) {
1787             return false;
1788           }
1789         } else {
1790           const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
1791           const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
1792           const int step_x = kTransformWidth4x4[tx_size];
1793           const int step_y = kTransformHeight4x4[tx_size];
1794           const int num4x4_wide = kNum4x4BlocksWide[plane_size];
1795           const int num4x4_high = kNum4x4BlocksHigh[plane_size];
1796           for (int y = 0; y < num4x4_high; y += step_y) {
1797             for (int x = 0; x < num4x4_wide; x += step_x) {
1798               if (!TransformBlock(
1799                       block, static_cast<Plane>(plane), base_x, base_y, tx_size,
1800                       x + (MultiplyBy16(chunk_x) >> subsampling_x),
1801                       y + (MultiplyBy16(chunk_y) >> subsampling_y), mode)) {
1802                 return false;
1803               }
1804             }
1805           }
1806         }
1807       } while (++plane < num_planes);
1808     }
1809   }
1810   return true;
1811 }
1812 
1813 // The purpose of this function is to limit the maximum size of motion vectors
1814 // and also, if use_intra_block_copy is true, to additionally constrain the
1815 // motion vector so that the data is fetched from parts of the tile that have
1816 // already been decoded and are not too close to the current block (in order to
1817 // make a pipelined decoder implementation feasible).
IsMvValid(const Block & block,bool is_compound) const1818 bool Tile::IsMvValid(const Block& block, bool is_compound) const {
1819   const BlockParameters& bp = *block.bp;
1820   for (int i = 0; i < 1 + static_cast<int>(is_compound); ++i) {
1821     for (int mv_component : bp.mv.mv[i].mv) {
1822       if (std::abs(mv_component) >= (1 << 14)) {
1823         return false;
1824       }
1825     }
1826   }
1827   if (!block.bp->prediction_parameters->use_intra_block_copy) {
1828     return true;
1829   }
1830   if ((bp.mv.mv[0].mv32 & 0x00070007) != 0) {
1831     return false;
1832   }
1833   const int delta_row = bp.mv.mv[0].mv[0] >> 3;
1834   const int delta_column = bp.mv.mv[0].mv[1] >> 3;
1835   int src_top_edge = MultiplyBy4(block.row4x4) + delta_row;
1836   int src_left_edge = MultiplyBy4(block.column4x4) + delta_column;
1837   const int src_bottom_edge = src_top_edge + block.height;
1838   const int src_right_edge = src_left_edge + block.width;
1839   if (block.HasChroma()) {
1840     if (block.width < 8 && subsampling_x_[kPlaneU] != 0) {
1841       src_left_edge -= 4;
1842     }
1843     if (block.height < 8 && subsampling_y_[kPlaneU] != 0) {
1844       src_top_edge -= 4;
1845     }
1846   }
1847   if (src_top_edge < MultiplyBy4(row4x4_start_) ||
1848       src_left_edge < MultiplyBy4(column4x4_start_) ||
1849       src_bottom_edge > MultiplyBy4(row4x4_end_) ||
1850       src_right_edge > MultiplyBy4(column4x4_end_)) {
1851     return false;
1852   }
1853   // sb_height_log2 = use_128x128_superblock ? log2(128) : log2(64)
1854   const int sb_height_log2 =
1855       6 + static_cast<int>(sequence_header_.use_128x128_superblock);
1856   const int active_sb_row = MultiplyBy4(block.row4x4) >> sb_height_log2;
1857   const int active_64x64_block_column = MultiplyBy4(block.column4x4) >> 6;
1858   const int src_sb_row = (src_bottom_edge - 1) >> sb_height_log2;
1859   const int src_64x64_block_column = (src_right_edge - 1) >> 6;
1860   const int total_64x64_blocks_per_row =
1861       ((column4x4_end_ - column4x4_start_ - 1) >> 4) + 1;
1862   const int active_64x64_block =
1863       active_sb_row * total_64x64_blocks_per_row + active_64x64_block_column;
1864   const int src_64x64_block =
1865       src_sb_row * total_64x64_blocks_per_row + src_64x64_block_column;
1866   if (src_64x64_block >= active_64x64_block - kIntraBlockCopyDelay64x64Blocks) {
1867     return false;
1868   }
1869 
1870   // Wavefront constraint: use only top left area of frame for reference.
1871   if (src_sb_row > active_sb_row) return false;
1872   const int gradient =
1873       1 + kIntraBlockCopyDelay64x64Blocks +
1874       static_cast<int>(sequence_header_.use_128x128_superblock);
1875   const int wavefront_offset = gradient * (active_sb_row - src_sb_row);
1876   return src_64x64_block_column < active_64x64_block_column -
1877                                       kIntraBlockCopyDelay64x64Blocks +
1878                                       wavefront_offset;
1879 }
1880 
AssignInterMv(const Block & block,bool is_compound)1881 bool Tile::AssignInterMv(const Block& block, bool is_compound) {
1882   int min[2];
1883   int max[2];
1884   GetClampParameters(block, min, max);
1885   BlockParameters& bp = *block.bp;
1886   const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
1887   bp.mv.mv64 = 0;
1888   if (is_compound) {
1889     for (int i = 0; i < 2; ++i) {
1890       const PredictionMode mode = GetSinglePredictionMode(i, bp.y_mode);
1891       MotionVector predicted_mv;
1892       if (mode == kPredictionModeGlobalMv) {
1893         predicted_mv = prediction_parameters.global_mv[i];
1894       } else {
1895         const int ref_mv_index = (mode == kPredictionModeNearestMv ||
1896                                   (mode == kPredictionModeNewMv &&
1897                                    prediction_parameters.ref_mv_count <= 1))
1898                                      ? 0
1899                                      : prediction_parameters.ref_mv_index;
1900         predicted_mv = prediction_parameters.reference_mv(ref_mv_index, i);
1901         if (ref_mv_index < prediction_parameters.ref_mv_count) {
1902           predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
1903           predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
1904         }
1905       }
1906       if (mode == kPredictionModeNewMv) {
1907         ReadMotionVector(block, i);
1908         bp.mv.mv[i].mv[0] += predicted_mv.mv[0];
1909         bp.mv.mv[i].mv[1] += predicted_mv.mv[1];
1910       } else {
1911         bp.mv.mv[i] = predicted_mv;
1912       }
1913     }
1914   } else {
1915     const PredictionMode mode = GetSinglePredictionMode(0, bp.y_mode);
1916     MotionVector predicted_mv;
1917     if (mode == kPredictionModeGlobalMv) {
1918       predicted_mv = prediction_parameters.global_mv[0];
1919     } else {
1920       const int ref_mv_index = (mode == kPredictionModeNearestMv ||
1921                                 (mode == kPredictionModeNewMv &&
1922                                  prediction_parameters.ref_mv_count <= 1))
1923                                    ? 0
1924                                    : prediction_parameters.ref_mv_index;
1925       predicted_mv = prediction_parameters.reference_mv(ref_mv_index);
1926       if (ref_mv_index < prediction_parameters.ref_mv_count) {
1927         predicted_mv.mv[0] = Clip3(predicted_mv.mv[0], min[0], max[0]);
1928         predicted_mv.mv[1] = Clip3(predicted_mv.mv[1], min[1], max[1]);
1929       }
1930     }
1931     if (mode == kPredictionModeNewMv) {
1932       ReadMotionVector(block, 0);
1933       bp.mv.mv[0].mv[0] += predicted_mv.mv[0];
1934       bp.mv.mv[0].mv[1] += predicted_mv.mv[1];
1935     } else {
1936       bp.mv.mv[0] = predicted_mv;
1937     }
1938   }
1939   return IsMvValid(block, is_compound);
1940 }
1941 
AssignIntraMv(const Block & block)1942 bool Tile::AssignIntraMv(const Block& block) {
1943   // TODO(linfengz): Check if the clamping process is necessary.
1944   int min[2];
1945   int max[2];
1946   GetClampParameters(block, min, max);
1947   BlockParameters& bp = *block.bp;
1948   const PredictionParameters& prediction_parameters = *bp.prediction_parameters;
1949   const MotionVector& ref_mv_0 = prediction_parameters.reference_mv(0);
1950   bp.mv.mv64 = 0;
1951   ReadMotionVector(block, 0);
1952   if (ref_mv_0.mv32 == 0) {
1953     const MotionVector& ref_mv_1 = prediction_parameters.reference_mv(1);
1954     if (ref_mv_1.mv32 == 0) {
1955       const int super_block_size4x4 = kNum4x4BlocksHigh[SuperBlockSize()];
1956       if (block.row4x4 - super_block_size4x4 < row4x4_start_) {
1957         bp.mv.mv[0].mv[1] -= MultiplyBy32(super_block_size4x4);
1958         bp.mv.mv[0].mv[1] -= MultiplyBy8(kIntraBlockCopyDelayPixels);
1959       } else {
1960         bp.mv.mv[0].mv[0] -= MultiplyBy32(super_block_size4x4);
1961       }
1962     } else {
1963       bp.mv.mv[0].mv[0] += Clip3(ref_mv_1.mv[0], min[0], max[0]);
1964       bp.mv.mv[0].mv[1] += Clip3(ref_mv_1.mv[1], min[0], max[0]);
1965     }
1966   } else {
1967     bp.mv.mv[0].mv[0] += Clip3(ref_mv_0.mv[0], min[0], max[0]);
1968     bp.mv.mv[0].mv[1] += Clip3(ref_mv_0.mv[1], min[1], max[1]);
1969   }
1970   return IsMvValid(block, /*is_compound=*/false);
1971 }
1972 
ResetEntropyContext(const Block & block)1973 void Tile::ResetEntropyContext(const Block& block) {
1974   const int num_planes = block.HasChroma() ? PlaneCount() : 1;
1975   int plane = kPlaneY;
1976   do {
1977     const int subsampling_x = subsampling_x_[plane];
1978     const int start_x = block.column4x4 >> subsampling_x;
1979     const int end_x =
1980         std::min((block.column4x4 + block.width4x4) >> subsampling_x,
1981                  frame_header_.columns4x4);
1982     memset(&coefficient_levels_[kEntropyContextTop][plane][start_x], 0,
1983            end_x - start_x);
1984     memset(&dc_categories_[kEntropyContextTop][plane][start_x], 0,
1985            end_x - start_x);
1986     const int subsampling_y = subsampling_y_[plane];
1987     const int start_y = block.row4x4 >> subsampling_y;
1988     const int end_y =
1989         std::min((block.row4x4 + block.height4x4) >> subsampling_y,
1990                  frame_header_.rows4x4);
1991     memset(&coefficient_levels_[kEntropyContextLeft][plane][start_y], 0,
1992            end_y - start_y);
1993     memset(&dc_categories_[kEntropyContextLeft][plane][start_y], 0,
1994            end_y - start_y);
1995   } while (++plane < num_planes);
1996 }
1997 
// Computes the prediction of an inter |block| for every plane it covers (luma
// only when the block has no chroma). Blocks whose |is_inter| flag is unset
// are left untouched and reported as success. Returns false as soon as one
// InterPrediction() call fails, true otherwise.
bool Tile::ComputePrediction(const Block& block) {
  const BlockParameters& bp = *block.bp;
  if (!bp.is_inter) return true;
  // 15 or 31: the superblock side in 4x4 units (16 or 32) minus one. Masking
  // with it keeps only the low bits of the block position, i.e. the position
  // relative to the enclosing superblock.
  const int mask =
      (1 << (4 + static_cast<int>(sequence_header_.use_128x128_superblock))) -
      1;
  const int sub_block_row4x4 = block.row4x4 & mask;
  const int sub_block_column4x4 = block.column4x4 & mask;
  const int plane_count = block.HasChroma() ? PlaneCount() : 1;
  // Returns true if this block applies local warping. The state is determined
  // in the Y plane and carried for use in the U/V planes.
  // But the U/V planes will not apply warping when the block size is smaller
  // than 8x8, even if this variable is true.
  bool is_local_valid = false;
  // Local warping parameters, similar usage as is_local_valid.
  GlobalMotion local_warp_params;
  int plane = kPlaneY;
  do {
    const int8_t subsampling_x = subsampling_x_[plane];
    const int8_t subsampling_y = subsampling_y_[plane];
    // Size of the block in this plane, in 4x4 units and in pixels, and the
    // pixel position of its top-left corner within the plane.
    const BlockSize plane_size = block.residual_size[plane];
    const int block_width4x4 = kNum4x4BlocksWide[plane_size];
    const int block_height4x4 = kNum4x4BlocksHigh[plane_size];
    const int block_width = MultiplyBy4(block_width4x4);
    const int block_height = MultiplyBy4(block_height4x4);
    const int base_x = MultiplyBy4(block.column4x4 >> subsampling_x);
    const int base_y = MultiplyBy4(block.row4x4 >> subsampling_y);
    // An intra second reference means an intra prediction is also required:
    // it is run here, before the inter prediction below, using the intra mode
    // mapped from |inter_intra_mode|.
    if (bp.reference_frame[1] == kReferenceFrameIntra) {
      // tr_* / bl_* presumably address the top-right and bottom-left
      // neighbors in |block_decoded|. ClearBlockDecoded() stores spec
      // position -1 at index 0, so these indices appear to carry a +1 offset
      // in both dimensions.
      const int tr_row4x4 = sub_block_row4x4 >> subsampling_y;
      const int tr_column4x4 =
          (sub_block_column4x4 >> subsampling_x) + block_width4x4 + 1;
      const int bl_row4x4 =
          (sub_block_row4x4 >> subsampling_y) + block_height4x4;
      const int bl_column4x4 = (sub_block_column4x4 >> subsampling_x) + 1;
      const TransformSize tx_size =
          k4x4SizeToTransformSize[k4x4WidthLog2[plane_size]]
                                 [k4x4HeightLog2[plane_size]];
      const bool has_left = block.left_available[plane];
      const bool has_top = block.top_available[plane];
      CALL_BITDEPTH_FUNCTION(
          IntraPrediction, block, static_cast<Plane>(plane), base_x, base_y,
          has_left, has_top,
          block.scratch_buffer->block_decoded[plane][tr_row4x4][tr_column4x4],
          block.scratch_buffer->block_decoded[plane][bl_row4x4][bl_column4x4],
          kInterIntraToIntraMode[block.bp->prediction_parameters
                                     ->inter_intra_mode],
          tx_size);
    }
    // Decide whether the plane is predicted in one piece or in sub-blocks.
    // For chroma planes the block position is rounded down to the subsampling
    // grid; when that moves the position, the blocks above, above-left and to
    // the left are inspected, and a single intra one among them (judged by
    // reference_frame[0]) forces one-piece prediction.
    int candidate_row = block.row4x4;
    int candidate_column = block.column4x4;
    bool some_use_intra = bp.reference_frame[0] == kReferenceFrameIntra;
    if (!some_use_intra && plane != 0) {
      candidate_row = (candidate_row >> subsampling_y) << subsampling_y;
      candidate_column = (candidate_column >> subsampling_x) << subsampling_x;
      if (candidate_row != block.row4x4) {
        // Top block.
        const BlockParameters& bp_top =
            *block_parameters_holder_.Find(candidate_row, block.column4x4);
        some_use_intra = bp_top.reference_frame[0] == kReferenceFrameIntra;
        if (!some_use_intra && candidate_column != block.column4x4) {
          // Top-left block.
          const BlockParameters& bp_top_left =
              *block_parameters_holder_.Find(candidate_row, candidate_column);
          some_use_intra =
              bp_top_left.reference_frame[0] == kReferenceFrameIntra;
        }
      }
      if (!some_use_intra && candidate_column != block.column4x4) {
        // Left block.
        const BlockParameters& bp_left =
            *block_parameters_holder_.Find(block.row4x4, candidate_column);
        some_use_intra = bp_left.reference_frame[0] == kReferenceFrameIntra;
      }
    }
    int prediction_width;
    int prediction_height;
    if (some_use_intra) {
      // One prediction covering the whole plane block, taken from the block's
      // own position.
      candidate_row = block.row4x4;
      candidate_column = block.column4x4;
      prediction_width = block_width;
      prediction_height = block_height;
    } else {
      // Pieces of the (subsampled) nominal block size; the loops below step
      // through the plane block in these units.
      prediction_width = block.width >> subsampling_x;
      prediction_height = block.height >> subsampling_y;
    }
    // (x, y) is the pixel offset of the current piece inside the plane block;
    // (r, c) is the matching offset, in 4x4 units, added to the candidate
    // position handed to InterPrediction().
    int r = 0;
    int y = 0;
    do {
      int c = 0;
      int x = 0;
      do {
        if (!InterPrediction(block, static_cast<Plane>(plane), base_x + x,
                             base_y + y, prediction_width, prediction_height,
                             candidate_row + r, candidate_column + c,
                             &is_local_valid, &local_warp_params)) {
          return false;
        }
        ++c;
        x += prediction_width;
      } while (x < block_width);
      ++r;
      y += prediction_height;
    } while (y < block_height);
  } while (++plane < plane_count);
  return true;
}
2104 
2105 #undef CALL_BITDEPTH_FUNCTION
2106 
PopulateDeblockFilterLevel(const Block & block)2107 void Tile::PopulateDeblockFilterLevel(const Block& block) {
2108   if (!post_filter_.DoDeblock()) return;
2109   BlockParameters& bp = *block.bp;
2110   const int mode_id =
2111       static_cast<int>(kPredictionModeDeltasMask.Contains(bp.y_mode));
2112   for (int i = 0; i < kFrameLfCount; ++i) {
2113     if (delta_lf_all_zero_) {
2114       bp.deblock_filter_level[i] = post_filter_.GetZeroDeltaDeblockFilterLevel(
2115           bp.segment_id, i, bp.reference_frame[0], mode_id);
2116     } else {
2117       bp.deblock_filter_level[i] =
2118           deblock_filter_levels_[bp.segment_id][i][bp.reference_frame[0]]
2119                                 [mode_id];
2120     }
2121   }
2122 }
2123 
// Parses one block at (|row4x4|, |column4x4|) of size |block_size| and, unless
// parsing and decoding are split (|split_parse_and_decode_|), also computes
// its prediction and reconstructs its residual. In split mode only the
// parsing half is done here and the block is queued so that DecodeBlock() can
// finish it later. Returns true on success or when the block starts outside
// the frame; false on any failure.
bool Tile::ProcessBlock(int row4x4, int column4x4, BlockSize block_size,
                        TileScratchBuffer* const scratch_buffer,
                        ResidualPtr* residual) {
  // Do not process the block if the starting point is beyond the visible frame.
  // This is equivalent to the has_row/has_column check in the
  // decode_partition() section of the spec when partition equals
  // kPartitionHorizontal or kPartitionVertical.
  if (row4x4 >= frame_header_.rows4x4 ||
      column4x4 >= frame_header_.columns4x4) {
    return true;
  }

  if (split_parse_and_decode_) {
    // Push block ordering info to the queue. DecodeBlock() will use this queue
    // to decode the blocks in the correct order.
    const int sb_row_index = SuperBlockRowIndex(row4x4);
    const int sb_column_index = SuperBlockColumnIndex(column4x4);
    residual_buffer_threaded_[sb_row_index][sb_column_index]
        ->partition_tree_order()
        ->Push(PartitionTreeNode(row4x4, column4x4, block_size));
  }

  BlockParameters* bp_ptr =
      block_parameters_holder_.Get(row4x4, column4x4, block_size);
  if (bp_ptr == nullptr) {
    LIBGAV1_DLOG(ERROR, "Failed to get BlockParameters.");
    return false;
  }
  BlockParameters& bp = *bp_ptr;
  Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
  bp.size = block_size;
  // In split mode every block gets its own PredictionParameters, which
  // DecodeBlock() releases once the block is decoded. Otherwise the single
  // tile-owned instance is lent to the block and handed back at the end of
  // this function.
  bp.prediction_parameters =
      split_parse_and_decode_ ? std::unique_ptr<PredictionParameters>(
                                    new (std::nothrow) PredictionParameters())
                              : std::move(prediction_parameters_);
  if (bp.prediction_parameters == nullptr) return false;
  if (!DecodeModeInfo(block)) return false;
  // Global motion only counts for blocks with no dimension equal to 4.
  bp.is_global_mv_block = (bp.y_mode == kPredictionModeGlobalMv ||
                           bp.y_mode == kPredictionModeGlobalGlobalMv) &&
                          !IsBlockDimension4(bp.size);
  PopulateDeblockFilterLevel(block);
  if (!ReadPaletteTokens(block)) return false;
  DecodeTransformSize(block);
  // Part of Section 5.11.37 in the spec (implemented as a simple lookup).
  bp.uv_transform_size = frame_header_.segmentation.lossless[bp.segment_id]
                             ? kTransformSize4x4
                             : kUVTransformSize[block.residual_size[kPlaneU]];
  if (bp.skip) ResetEntropyContext(block);
  if (split_parse_and_decode_) {
    // Parse only; prediction and reconstruction happen in DecodeBlock().
    if (!Residual(block, kProcessingModeParseOnly)) return false;
  } else {
    if (!ComputePrediction(block) ||
        !Residual(block, kProcessingModeParseAndDecode)) {
      return false;
    }
  }
  // If frame_header_.segmentation.enabled is false, bp.segment_id is 0 for all
  // blocks. We don't need to save bp.segment_id in the current frame because
  // the current frame's segmentation map will be cleared to all 0s.
  //
  // If frame_header_.segmentation.enabled is true and
  // frame_header_.segmentation.update_map is false, we will copy the previous
  // frame's segmentation map to the current frame. So we don't need to save
  // bp.segment_id in the current frame.
  if (frame_header_.segmentation.enabled &&
      frame_header_.segmentation.update_map) {
    // Limit the filled area to the part of the block inside the frame.
    const int x_limit = std::min(frame_header_.columns4x4 - column4x4,
                                 static_cast<int>(block.width4x4));
    const int y_limit = std::min(frame_header_.rows4x4 - row4x4,
                                 static_cast<int>(block.height4x4));
    current_frame_.segmentation_map()->FillBlock(row4x4, column4x4, x_limit,
                                                 y_limit, bp.segment_id);
  }
  StoreMotionFieldMvsIntoCurrentFrame(block);
  if (!split_parse_and_decode_) {
    // Return the shared PredictionParameters to the tile for the next block.
    prediction_parameters_ = std::move(bp.prediction_parameters);
  }
  return true;
}
2203 
DecodeBlock(int row4x4,int column4x4,BlockSize block_size,TileScratchBuffer * const scratch_buffer,ResidualPtr * residual)2204 bool Tile::DecodeBlock(int row4x4, int column4x4, BlockSize block_size,
2205                        TileScratchBuffer* const scratch_buffer,
2206                        ResidualPtr* residual) {
2207   if (row4x4 >= frame_header_.rows4x4 ||
2208       column4x4 >= frame_header_.columns4x4) {
2209     return true;
2210   }
2211   Block block(*this, block_size, row4x4, column4x4, scratch_buffer, residual);
2212   if (!ComputePrediction(block) ||
2213       !Residual(block, kProcessingModeDecodeOnly)) {
2214     return false;
2215   }
2216   block.bp->prediction_parameters.reset(nullptr);
2217   return true;
2218 }
2219 
// Walks the partition tree of the superblock that starts at (|row4x4_start|,
// |column4x4_start|). The partition type of each node is read from the
// bitstream; kPartitionSplit pushes the four quadrants for further
// processing, every other type calls ProcessBlock() on its sub-blocks in
// order. Returns false if reading a partition fails, if a partition yields an
// invalid block size, or if any ProcessBlock() call fails.
bool Tile::ProcessPartition(int row4x4_start, int column4x4_start,
                            TileScratchBuffer* const scratch_buffer,
                            ResidualPtr* residual) {
  Stack<PartitionTreeNode, kDfsStackSize> stack;

  // Set up the first iteration.
  stack.Push(
      PartitionTreeNode(row4x4_start, column4x4_start, SuperBlockSize()));

  // DFS loop. If it sees a terminal node (leaf node), ProcessBlock is invoked.
  // Otherwise, the children are pushed into the stack for future processing.
  do {
    PartitionTreeNode node = stack.Pop();
    int row4x4 = node.row4x4;
    int column4x4 = node.column4x4;
    BlockSize block_size = node.block_size;

    // Nodes that start outside the frame carry no data. |continue| in a
    // do-while jumps to the loop condition, so the nodes still on the stack
    // are processed as usual.
    if (row4x4 >= frame_header_.rows4x4 ||
        column4x4 >= frame_header_.columns4x4) {
      continue;
    }
    const int block_width4x4 = kNum4x4BlocksWide[block_size];
    // Partition nodes are expected to be square.
    assert(block_width4x4 == kNum4x4BlocksHigh[block_size]);
    const int half_block4x4 = block_width4x4 >> 1;
    // True when the lower / right half of the node starts inside the frame.
    const bool has_rows = (row4x4 + half_block4x4) < frame_header_.rows4x4;
    const bool has_columns =
        (column4x4 + half_block4x4) < frame_header_.columns4x4;
    Partition partition;
    if (!ReadPartition(row4x4, column4x4, block_size, has_rows, has_columns,
                       &partition)) {
      LIBGAV1_DLOG(ERROR, "Failed to read partition for row: %d column: %d",
                   row4x4, column4x4);
      return false;
    }
    const BlockSize sub_size = kSubSize[partition][block_size];
    // Section 6.10.4: It is a requirement of bitstream conformance that
    // get_plane_residual_size( subSize, 1 ) is not equal to BLOCK_INVALID
    // every time subSize is computed.
    if (sub_size == kBlockInvalid ||
        kPlaneResidualSize[sub_size]
                          [sequence_header_.color_config.subsampling_x]
                          [sequence_header_.color_config.subsampling_y] ==
            kBlockInvalid) {
      LIBGAV1_DLOG(
          ERROR,
          "Invalid sub-block/plane size for row: %d column: %d partition: "
          "%d block_size: %d sub_size: %d subsampling_x/y: %d, %d",
          row4x4, column4x4, partition, block_size, sub_size,
          sequence_header_.color_config.subsampling_x,
          sequence_header_.color_config.subsampling_y);
      return false;
    }

    const int quarter_block4x4 = half_block4x4 >> 1;
    // Size of one quadrant; used for the square parts of the "WithSplit"
    // partition types, whose remaining part uses |sub_size|.
    const BlockSize split_size = kSubSize[kPartitionSplit][block_size];
    assert(partition == kPartitionNone || sub_size != kBlockInvalid);
    switch (partition) {
      case kPartitionNone:
        // Leaf: the node is a single block.
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual)) {
          return false;
        }
        break;
      case kPartitionSplit:
        // The children must be added in reverse order since a stack is being
        // used.
        stack.Push(PartitionTreeNode(row4x4 + half_block4x4,
                                     column4x4 + half_block4x4, sub_size));
        stack.Push(
            PartitionTreeNode(row4x4 + half_block4x4, column4x4, sub_size));
        stack.Push(
            PartitionTreeNode(row4x4, column4x4 + half_block4x4, sub_size));
        stack.Push(PartitionTreeNode(row4x4, column4x4, sub_size));
        break;
      case kPartitionHorizontal:
        // Top half, then bottom half.
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
                          scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionVertical:
        // Left half, then right half.
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
                          scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionHorizontalWithTopSplit:
        // Top-left and top-right quadrants, then the bottom half.
        if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
                          scratch_buffer, residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4, sub_size,
                          scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionHorizontalWithBottomSplit:
        // Top half, then the bottom-left and bottom-right quadrants.
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
                          scratch_buffer, residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
                          split_size, scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionVerticalWithLeftSplit:
        // Top-left and bottom-left quadrants, then the right half.
        if (!ProcessBlock(row4x4, column4x4, split_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4, split_size,
                          scratch_buffer, residual) ||
            !ProcessBlock(row4x4, column4x4 + half_block4x4, sub_size,
                          scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionVerticalWithRightSplit:
        // Left half, then the top-right and bottom-right quadrants.
        if (!ProcessBlock(row4x4, column4x4, sub_size, scratch_buffer,
                          residual) ||
            !ProcessBlock(row4x4, column4x4 + half_block4x4, split_size,
                          scratch_buffer, residual) ||
            !ProcessBlock(row4x4 + half_block4x4, column4x4 + half_block4x4,
                          split_size, scratch_buffer, residual)) {
          return false;
        }
        break;
      case kPartitionHorizontal4:
        // Four blocks stacked vertically, each a quarter of the node high.
        for (int i = 0; i < 4; ++i) {
          if (!ProcessBlock(row4x4 + i * quarter_block4x4, column4x4, sub_size,
                            scratch_buffer, residual)) {
            return false;
          }
        }
        break;
      case kPartitionVertical4:
        // Four blocks side by side, each a quarter of the node wide.
        for (int i = 0; i < 4; ++i) {
          if (!ProcessBlock(row4x4, column4x4 + i * quarter_block4x4, sub_size,
                            scratch_buffer, residual)) {
            return false;
          }
        }
        break;
    }
  } while (!stack.Empty());
  return true;
}
2370 
ResetLoopRestorationParams()2371 void Tile::ResetLoopRestorationParams() {
2372   for (int plane = kPlaneY; plane < kMaxPlanes; ++plane) {
2373     for (int i = WienerInfo::kVertical; i <= WienerInfo::kHorizontal; ++i) {
2374       reference_unit_info_[plane].sgr_proj_info.multiplier[i] =
2375           kSgrProjDefaultMultiplier[i];
2376       for (int j = 0; j < kNumWienerCoefficients; ++j) {
2377         reference_unit_info_[plane].wiener_info.filter[i][j] =
2378             kWienerDefaultFilter[j];
2379       }
2380     }
2381   }
2382 }
2383 
ResetCdef(const int row4x4,const int column4x4)2384 void Tile::ResetCdef(const int row4x4, const int column4x4) {
2385   if (!sequence_header_.enable_cdef) return;
2386   const int row = DivideBy16(row4x4);
2387   const int column = DivideBy16(column4x4);
2388   cdef_index_[row][column] = -1;
2389   if (sequence_header_.use_128x128_superblock) {
2390     const int cdef_size4x4 = kNum4x4BlocksWide[kBlock64x64];
2391     const int border_row = DivideBy16(row4x4 + cdef_size4x4);
2392     const int border_column = DivideBy16(column4x4 + cdef_size4x4);
2393     cdef_index_[row][border_column] = -1;
2394     cdef_index_[border_row][column] = -1;
2395     cdef_index_[border_row][border_column] = -1;
2396   }
2397 }
2398 
ClearBlockDecoded(TileScratchBuffer * const scratch_buffer,int row4x4,int column4x4)2399 void Tile::ClearBlockDecoded(TileScratchBuffer* const scratch_buffer,
2400                              int row4x4, int column4x4) {
2401   // Set everything to false.
2402   memset(scratch_buffer->block_decoded, 0,
2403          sizeof(scratch_buffer->block_decoded));
2404   // Set specific edge cases to true.
2405   const int sb_size4 = sequence_header_.use_128x128_superblock ? 32 : 16;
2406   for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
2407     const int subsampling_x = subsampling_x_[plane];
2408     const int subsampling_y = subsampling_y_[plane];
2409     const int sb_width4 = (column4x4_end_ - column4x4) >> subsampling_x;
2410     const int sb_height4 = (row4x4_end_ - row4x4) >> subsampling_y;
2411     // The memset is equivalent to the following lines in the spec:
2412     // for ( x = -1; x <= ( sbSize4 >> subX ); x++ ) {
2413     //   if ( y < 0 && x < sbWidth4 ) {
2414     //     BlockDecoded[plane][y][x] = 1
2415     //   }
2416     // }
2417     const int num_elements =
2418         std::min((sb_size4 >> subsampling_x_[plane]) + 1, sb_width4) + 1;
2419     memset(&scratch_buffer->block_decoded[plane][0][0], 1, num_elements);
2420     // The for loop is equivalent to the following lines in the spec:
2421     // for ( y = -1; y <= ( sbSize4 >> subY ); y++ )
2422     //   if ( x < 0 && y < sbHeight4 )
2423     //     BlockDecoded[plane][y][x] = 1
2424     //   }
2425     // }
2426     // BlockDecoded[plane][sbSize4 >> subY][-1] = 0
2427     for (int y = -1; y < std::min((sb_size4 >> subsampling_y), sb_height4);
2428          ++y) {
2429       scratch_buffer->block_decoded[plane][y + 1][0] = true;
2430     }
2431   }
2432 }
2433 
ProcessSuperBlock(int row4x4,int column4x4,TileScratchBuffer * const scratch_buffer,ProcessingMode mode)2434 bool Tile::ProcessSuperBlock(int row4x4, int column4x4,
2435                              TileScratchBuffer* const scratch_buffer,
2436                              ProcessingMode mode) {
2437   const bool parsing =
2438       mode == kProcessingModeParseOnly || mode == kProcessingModeParseAndDecode;
2439   const bool decoding = mode == kProcessingModeDecodeOnly ||
2440                         mode == kProcessingModeParseAndDecode;
2441   if (parsing) {
2442     read_deltas_ = frame_header_.delta_q.present;
2443     ResetCdef(row4x4, column4x4);
2444   }
2445   if (decoding) {
2446     ClearBlockDecoded(scratch_buffer, row4x4, column4x4);
2447   }
2448   const BlockSize block_size = SuperBlockSize();
2449   if (parsing) {
2450     ReadLoopRestorationCoefficients(row4x4, column4x4, block_size);
2451   }
2452   if (parsing && decoding) {
2453     uint8_t* residual_buffer = residual_buffer_.get();
2454     if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
2455                           &residual_buffer)) {
2456       LIBGAV1_DLOG(ERROR, "Error decoding partition row: %d column: %d", row4x4,
2457                    column4x4);
2458       return false;
2459     }
2460     return true;
2461   }
2462   const int sb_row_index = SuperBlockRowIndex(row4x4);
2463   const int sb_column_index = SuperBlockColumnIndex(column4x4);
2464   if (parsing) {
2465     residual_buffer_threaded_[sb_row_index][sb_column_index] =
2466         residual_buffer_pool_->Get();
2467     if (residual_buffer_threaded_[sb_row_index][sb_column_index] == nullptr) {
2468       LIBGAV1_DLOG(ERROR, "Failed to get residual buffer.");
2469       return false;
2470     }
2471     uint8_t* residual_buffer =
2472         residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
2473     if (!ProcessPartition(row4x4, column4x4, scratch_buffer,
2474                           &residual_buffer)) {
2475       LIBGAV1_DLOG(ERROR, "Error parsing partition row: %d column: %d", row4x4,
2476                    column4x4);
2477       return false;
2478     }
2479   } else {
2480     if (!DecodeSuperBlock(sb_row_index, sb_column_index, scratch_buffer)) {
2481       LIBGAV1_DLOG(ERROR, "Error decoding superblock row: %d column: %d",
2482                    row4x4, column4x4);
2483       return false;
2484     }
2485     residual_buffer_pool_->Release(
2486         std::move(residual_buffer_threaded_[sb_row_index][sb_column_index]));
2487   }
2488   return true;
2489 }
2490 
DecodeSuperBlock(int sb_row_index,int sb_column_index,TileScratchBuffer * const scratch_buffer)2491 bool Tile::DecodeSuperBlock(int sb_row_index, int sb_column_index,
2492                             TileScratchBuffer* const scratch_buffer) {
2493   uint8_t* residual_buffer =
2494       residual_buffer_threaded_[sb_row_index][sb_column_index]->buffer();
2495   Queue<PartitionTreeNode>& partition_tree_order =
2496       *residual_buffer_threaded_[sb_row_index][sb_column_index]
2497            ->partition_tree_order();
2498   while (!partition_tree_order.Empty()) {
2499     PartitionTreeNode block = partition_tree_order.Front();
2500     if (!DecodeBlock(block.row4x4, block.column4x4, block.block_size,
2501                      scratch_buffer, &residual_buffer)) {
2502       LIBGAV1_DLOG(ERROR, "Error decoding block row: %d column: %d",
2503                    block.row4x4, block.column4x4);
2504       return false;
2505     }
2506     partition_tree_order.Pop();
2507   }
2508   return true;
2509 }
2510 
ReadLoopRestorationCoefficients(int row4x4,int column4x4,BlockSize block_size)2511 void Tile::ReadLoopRestorationCoefficients(int row4x4, int column4x4,
2512                                            BlockSize block_size) {
2513   if (frame_header_.allow_intrabc) return;
2514   LoopRestorationInfo* const restoration_info = post_filter_.restoration_info();
2515   const bool is_superres_scaled =
2516       frame_header_.width != frame_header_.upscaled_width;
2517   for (int plane = kPlaneY; plane < PlaneCount(); ++plane) {
2518     LoopRestorationUnitInfo unit_info;
2519     if (restoration_info->PopulateUnitInfoForSuperBlock(
2520             static_cast<Plane>(plane), block_size, is_superres_scaled,
2521             frame_header_.superres_scale_denominator, row4x4, column4x4,
2522             &unit_info)) {
2523       for (int unit_row = unit_info.row_start; unit_row < unit_info.row_end;
2524            ++unit_row) {
2525         for (int unit_column = unit_info.column_start;
2526              unit_column < unit_info.column_end; ++unit_column) {
2527           const int unit_id = unit_row * restoration_info->num_horizontal_units(
2528                                              static_cast<Plane>(plane)) +
2529                               unit_column;
2530           restoration_info->ReadUnitCoefficients(
2531               &reader_, &symbol_decoder_context_, static_cast<Plane>(plane),
2532               unit_id, &reference_unit_info_);
2533         }
2534       }
2535     }
2536   }
2537 }
2538 
StoreMotionFieldMvsIntoCurrentFrame(const Block & block)2539 void Tile::StoreMotionFieldMvsIntoCurrentFrame(const Block& block) {
2540   if (frame_header_.refresh_frame_flags == 0 ||
2541       IsIntraFrame(frame_header_.frame_type)) {
2542     return;
2543   }
2544   // Iterate over odd rows/columns beginning at the first odd row/column for the
2545   // block. It is done this way because motion field mvs are only needed at a
2546   // 8x8 granularity.
2547   const int row_start4x4 = block.row4x4 | 1;
2548   const int row_limit4x4 =
2549       std::min(block.row4x4 + block.height4x4, frame_header_.rows4x4);
2550   if (row_start4x4 >= row_limit4x4) return;
2551   const int column_start4x4 = block.column4x4 | 1;
2552   const int column_limit4x4 =
2553       std::min(block.column4x4 + block.width4x4, frame_header_.columns4x4);
2554   if (column_start4x4 >= column_limit4x4) return;
2555 
2556   // The largest reference MV component that can be saved.
2557   constexpr int kRefMvsLimit = (1 << 12) - 1;
2558   const BlockParameters& bp = *block.bp;
2559   ReferenceInfo* reference_info = current_frame_.reference_info();
2560   for (int i = 1; i >= 0; --i) {
2561     const ReferenceFrameType reference_frame_to_store = bp.reference_frame[i];
2562     // Must make a local copy so that StoreMotionFieldMvs() knows there is no
2563     // overlap between load and store.
2564     const MotionVector mv_to_store = bp.mv.mv[i];
2565     const int mv_row = std::abs(mv_to_store.mv[MotionVector::kRow]);
2566     const int mv_column = std::abs(mv_to_store.mv[MotionVector::kColumn]);
2567     if (reference_frame_to_store > kReferenceFrameIntra &&
2568         // kRefMvsLimit equals 0x07FF, so we can first bitwise OR the two
2569         // absolute values and then compare with kRefMvsLimit to save a branch.
2570         // The next line is equivalent to:
2571         // mv_row <= kRefMvsLimit && mv_column <= kRefMvsLimit
2572         (mv_row | mv_column) <= kRefMvsLimit &&
2573         reference_info->relative_distance_from[reference_frame_to_store] < 0) {
2574       const int row_start8x8 = DivideBy2(row_start4x4);
2575       const int row_limit8x8 = DivideBy2(row_limit4x4);
2576       const int column_start8x8 = DivideBy2(column_start4x4);
2577       const int column_limit8x8 = DivideBy2(column_limit4x4);
2578       const int rows = row_limit8x8 - row_start8x8;
2579       const int columns = column_limit8x8 - column_start8x8;
2580       const ptrdiff_t stride = DivideBy2(current_frame_.columns4x4());
2581       ReferenceFrameType* const reference_frame_row_start =
2582           &reference_info
2583                ->motion_field_reference_frame[row_start8x8][column_start8x8];
2584       MotionVector* const mv =
2585           &reference_info->motion_field_mv[row_start8x8][column_start8x8];
2586 
2587       // Specialize columns cases 1, 2, 4, 8 and 16. This makes memset() inlined
2588       // and simplifies std::fill() for these cases.
2589       if (columns <= 1) {
2590         // Don't change the above condition to (columns == 1).
2591         // Condition (columns <= 1) may help the compiler simplify the inlining
2592         // of the general case of StoreMotionFieldMvs() by eliminating the
2593         // (columns == 0) case.
2594         assert(columns == 1);
2595         StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
2596                             1, reference_frame_row_start, mv);
2597       } else if (columns == 2) {
2598         StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
2599                             2, reference_frame_row_start, mv);
2600       } else if (columns == 4) {
2601         StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
2602                             4, reference_frame_row_start, mv);
2603       } else if (columns == 8) {
2604         StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
2605                             8, reference_frame_row_start, mv);
2606       } else if (columns == 16) {
2607         StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
2608                             16, reference_frame_row_start, mv);
2609       } else if (columns < 16) {
2610         // This always true condition (columns < 16) may help the compiler
2611         // simplify the inlining of the following function.
2612         // This general case is rare and usually only happens to the blocks
2613         // which contain the right boundary of the frame.
2614         StoreMotionFieldMvs(reference_frame_to_store, mv_to_store, stride, rows,
2615                             columns, reference_frame_row_start, mv);
2616       } else {
2617         assert(false);
2618       }
2619       return;
2620     }
2621   }
2622 }
2623 
2624 }  // namespace libgav1
2625