1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include "av1/encoder/encodetxb.h"
13 
14 #include "aom_ports/mem.h"
15 #include "av1/common/blockd.h"
16 #include "av1/common/idct.h"
17 #include "av1/common/pred_common.h"
18 #include "av1/common/scan.h"
19 #include "av1/encoder/bitstream.h"
20 #include "av1/encoder/cost.h"
21 #include "av1/encoder/encodeframe.h"
22 #include "av1/encoder/hash.h"
23 #include "av1/encoder/rdopt.h"
24 #include "av1/encoder/tokenize.h"
25 
#if CONFIG_HTB_TRELLIS
// File-scope state that only exists when CONFIG_HTB_TRELLIS is enabled.
static int hbt_needs_init = 1;  // starts at 1
static CRC32C crc_calculator;

// Sizing constants.
static const int HBT_EOB = 16;            // also the length in opt_qcoeff
static const int HBT_TABLE_SIZE = 65536;  // 16 bit: holds 65536 'arrays'
static const int HBT_ARRAY_LENGTH = 256;  // 8 bit: 256 entries
// If removed in hbt_create_hashes or increased beyond int8_t, widen deltas type
static const int HBT_KICKOUT = 3;

// One entry of hbt_hash_table.
typedef struct OptTxbQcoeff {
  // Use larger type if larger/no kickout value is used in hbt_create_hashes
  int8_t deltas[16];
  uint32_t hbt_qc_hash, hbt_ctx_hash;
  int init, rate_cost;
} OptTxbQcoeff;

// Table of entries; NOTE(review): allocation and sizing happen outside this
// part of the file.
OptTxbQcoeff *hbt_hash_table;
#endif  // CONFIG_HTB_TRELLIS
46 
47 typedef struct LevelDownStats {
48   int update;
49   tran_low_t low_qc;
50   tran_low_t low_dqc;
51   int64_t dist0;
52   int rate;
53   int rate_low;
54   int64_t dist;
55   int64_t dist_low;
56   int64_t rd;
57   int64_t rd_low;
58   int64_t nz_rd;
59   int64_t rd_diff;
60   int cost_diff;
61   int64_t dist_diff;
62   int new_eob;
63 } LevelDownStats;
64 
get_dqv(const int16_t * dequant,int coeff_idx,const qm_val_t * iqmatrix)65 static INLINE int get_dqv(const int16_t *dequant, int coeff_idx,
66                           const qm_val_t *iqmatrix) {
67   int dqv = dequant[!!coeff_idx];
68   if (iqmatrix != NULL)
69     dqv =
70         ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
71   return dqv;
72 }
73 
// (Re)allocates cpi->coeff_buffer_base. Any previous buffer is released
// first. The element count is (mi_rows >> mib_size_log2) + 1 times
// (mi_cols >> mib_size_log2) + 1; the buffer is 32-byte aligned.
// Allocation failure is reported through CHECK_MEM_ERROR.
void av1_alloc_txb_buf(AV1_COMP *cpi) {
  AV1_COMMON *cm = &cpi->common;
  const int unit_rows =
      (cm->mi_params.mi_rows >> cm->seq_params.mib_size_log2) + 1;
  const int unit_cols =
      (cm->mi_params.mi_cols >> cm->seq_params.mib_size_log2) + 1;
  int size = unit_rows * unit_cols;

  av1_free_txb_buf(cpi);
  // TODO(jingning): This should be further reduced.
  CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base,
                  aom_memalign(32, sizeof(*cpi->coeff_buffer_base) * size));
}
84 
// Releases cpi->coeff_buffer_base. The pointer is reset to NULL afterwards so
// that calling this function again (it is also called at the start of
// av1_alloc_txb_buf()) cannot free the same buffer twice or leave a dangling
// pointer behind.
void av1_free_txb_buf(AV1_COMP *cpi) {
  aom_free(cpi->coeff_buffer_base);
  cpi->coeff_buffer_base = NULL;
}
86 
// Writes `level` to `w` as a prefix/suffix code on x = level + 1: first
// (length - 1) zero bits, then the `length` bits of x from most significant
// to least significant, where `length` is the number of significant bits
// in x.
static void write_golomb(aom_writer *w, int level) {
  const int x = level + 1;

  // Count the significant bits of x.
  int length = 0;
  for (int rem = x; rem; rem >>= 1) ++length;
  assert(length > 0);

  // Zero prefix announcing how many bits follow.
  for (int n = length - 1; n > 0; --n) aom_write_bit(w, 0);

  // The value itself, MSB first.
  for (int shift = length - 1; shift >= 0; --shift)
    aom_write_bit(w, (x >> shift) & 0x01);
}
102 
get_lower_coeff(tran_low_t qc)103 static INLINE tran_low_t get_lower_coeff(tran_low_t qc) {
104   if (qc == 0) {
105     return 0;
106   }
107   return qc > 0 ? qc - 1 : qc + 1;
108 }
109 
qcoeff_to_dqcoeff(tran_low_t qc,int coeff_idx,int dqv,int shift,const qm_val_t * iqmatrix)110 static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int coeff_idx,
111                                            int dqv, int shift,
112                                            const qm_val_t *iqmatrix) {
113   int sign = qc < 0 ? -1 : 1;
114   if (iqmatrix != NULL)
115     dqv =
116         ((iqmatrix[coeff_idx] * dqv) + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
117   return sign * ((abs(qc) * dqv) >> shift);
118 }
119 
get_coeff_dist(tran_low_t tcoeff,tran_low_t dqcoeff,int shift)120 static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
121                                      int shift) {
122   const int64_t diff = (tcoeff - dqcoeff) * (1 << shift);
123   const int64_t error = diff * diff;
124   return error;
125 }
126 
// EOB position token for eob values 0..32, indexed directly by eob. Used by
// get_eob_pos_token() when eob < 33.
static const int8_t eob_to_pos_small[33] = {
  0,                       // eob 0
  1,                       // eob 1
  2,                       // eob 2
  3, 3,                    // eob 3-4
  4, 4, 4, 4,              // eob 5-8
  5, 5, 5, 5, 5, 5, 5, 5,  // eob 9-16
  6, 6, 6, 6, 6, 6, 6, 6,  // eob 17-24
  6, 6, 6, 6, 6, 6, 6, 6   // eob 25-32
};
134 
// EOB position token for eob values above 32. get_eob_pos_token() indexes
// this table with min((eob - 1) >> 5, 16), so entry 0 is never selected on
// that path.
static const int8_t eob_to_pos_large[17] = {
  6,   // index 0: place holder
  7,   // index 1: eob 33-64
  8,   // index 2: eob 65-96
  8,   // index 3: eob 97-128
  9,  9,  9,  9,                   // index 4-7: eob 129-256
  10, 10, 10, 10, 10, 10, 10, 10,  // index 8-15: eob 257-512
  11                               // index 16: eob 513 and above
};
143 
get_eob_pos_token(const int eob,int * const extra)144 static INLINE int get_eob_pos_token(const int eob, int *const extra) {
145   int t;
146 
147   if (eob < 33) {
148     t = eob_to_pos_small[eob];
149   } else {
150     const int e = AOMMIN((eob - 1) >> 5, 16);
151     t = eob_to_pos_large[e];
152   }
153 
154   *extra = eob - av1_eob_group_start[t];
155 
156   return t;
157 }
158 
159 #if CONFIG_ENTROPY_STATS
av1_update_eob_context(int cdf_idx,int eob,TX_SIZE tx_size,TX_CLASS tx_class,PLANE_TYPE plane,FRAME_CONTEXT * ec_ctx,FRAME_COUNTS * counts,uint8_t allow_update_cdf)160 void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size,
161                             TX_CLASS tx_class, PLANE_TYPE plane,
162                             FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts,
163                             uint8_t allow_update_cdf) {
164 #else
165 void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class,
166                             PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx,
167                             uint8_t allow_update_cdf) {
168 #endif
169   int eob_extra;
170   const int eob_pt = get_eob_pos_token(eob, &eob_extra);
171   TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
172 
173   const int eob_multi_size = txsize_log2_minus4[tx_size];
174   const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
175 
176   switch (eob_multi_size) {
177     case 0:
178 #if CONFIG_ENTROPY_STATS
179       ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
180 #endif
181       if (allow_update_cdf)
182         update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5);
183       break;
184     case 1:
185 #if CONFIG_ENTROPY_STATS
186       ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
187 #endif
188       if (allow_update_cdf)
189         update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6);
190       break;
191     case 2:
192 #if CONFIG_ENTROPY_STATS
193       ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
194 #endif
195       if (allow_update_cdf)
196         update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7);
197       break;
198     case 3:
199 #if CONFIG_ENTROPY_STATS
200       ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
201 #endif
202       if (allow_update_cdf) {
203         update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1,
204                    8);
205       }
206       break;
207     case 4:
208 #if CONFIG_ENTROPY_STATS
209       ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
210 #endif
211       if (allow_update_cdf) {
212         update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1,
213                    9);
214       }
215       break;
216     case 5:
217 #if CONFIG_ENTROPY_STATS
218       ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
219 #endif
220       if (allow_update_cdf) {
221         update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1,
222                    10);
223       }
224       break;
225     case 6:
226     default:
227 #if CONFIG_ENTROPY_STATS
228       ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
229 #endif
230       if (allow_update_cdf) {
231         update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1,
232                    11);
233       }
234       break;
235   }
236 
237   if (av1_eob_offset_bits[eob_pt] > 0) {
238     int eob_ctx = eob_pt - 3;
239     int eob_shift = av1_eob_offset_bits[eob_pt] - 1;
240     int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
241 #if CONFIG_ENTROPY_STATS
242     counts->eob_extra[cdf_idx][txs_ctx][plane][eob_pt][bit]++;
243 #endif  // CONFIG_ENTROPY_STATS
244     if (allow_update_cdf)
245       update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2);
246   }
247 }
248 
249 static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs,
250                         const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) {
251   int eob_extra;
252   const int eob_pt = get_eob_pos_token(eob, &eob_extra);
253   int eob_cost = 0;
254   const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
255   eob_cost = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1];
256 
257   if (av1_eob_offset_bits[eob_pt] > 0) {
258     const int eob_ctx = eob_pt - 3;
259     const int eob_shift = av1_eob_offset_bits[eob_pt] - 1;
260     const int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
261     eob_cost += txb_costs->eob_extra_cost[eob_ctx][bit];
262     const int offset_bits = av1_eob_offset_bits[eob_pt];
263     if (offset_bits > 1) eob_cost += av1_cost_literal(offset_bits - 1);
264   }
265   return eob_cost;
266 }
267 
268 static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx,
269                                     const int (*dc_sign_cost)[2],
270                                     int dc_sign_ctx) {
271   if (coeff_idx == 0) {
272     const int sign = (qc < 0) ? 1 : 0;
273     return dc_sign_cost[dc_sign_ctx][sign];
274   }
275   return av1_cost_literal(1);
276 }
277 
// Golomb cost for remainders r = 0..31, used by get_br_cost_with_diff()
// instead of calling get_golomb_cost(). Each entry for r >= 1 is
// 512 * (2 * L - 1), L being the number of significant bits of r.
static const int golomb_bits_cost[32] = {
  0,                                                               // r = 0
  512,                                                             // r = 1
  512 * 3, 512 * 3,                                                // r = 2-3
  512 * 5, 512 * 5, 512 * 5, 512 * 5,                              // r = 4-7
  512 * 7, 512 * 7, 512 * 7, 512 * 7,                              // r = 8-11
  512 * 7, 512 * 7, 512 * 7, 512 * 7,                              // r = 12-15
  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9,
  512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9
};
// golomb_cost_diff[r] == golomb_bits_cost[r] - golomb_bits_cost[r - 1] for
// r >= 1: non-zero only at r = 1 and at the powers of two, where the code
// gets longer.
static const int golomb_cost_diff[32] = {
  0,       512,                    // r = 0-1
  512 * 2, 0,                      // r = 2-3
  512 * 2, 0, 0, 0,                // r = 4-7
  512 * 2, 0, 0, 0, 0, 0, 0, 0,    // r = 8-15
  512 * 2, 0, 0, 0, 0, 0, 0, 0,    // r = 16-23
  0,       0, 0, 0, 0, 0, 0, 0     // r = 24-31
};
288 
289 static INLINE int get_golomb_cost(int abs_qc) {
290   if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
291     const int r = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
292     const int length = get_msb(r) + 1;
293     return av1_cost_literal(2 * length - 1);
294   }
295   return 0;
296 }
297 
298 static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps,
299                                         int *diff) {
300   const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
301   int golomb_bits = 0;
302   if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS)
303     *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1];
304 
305   if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) {
306     int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
307     if (r < 32) {
308       golomb_bits = golomb_bits_cost[r];
309       *diff += golomb_cost_diff[r];
310     } else {
311       golomb_bits = get_golomb_cost(level);
312       *diff += (r & (r - 1)) == 0 ? 1024 : 0;
313     }
314   }
315 
316   return coeff_lps[base_range] + golomb_bits;
317 }
318 
319 static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) {
320   const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
321   return coeff_lps[base_range] + get_golomb_cost(level);
322 }
323 
324 static int get_coeff_cost(const tran_low_t qc, const int scan_idx,
325                           const int is_eob, const TxbInfo *const txb_info,
326                           const LV_MAP_COEFF_COST *const txb_costs,
327                           const int coeff_ctx, const TX_CLASS tx_class) {
328   const TXB_CTX *const txb_ctx = txb_info->txb_ctx;
329   const int is_nz = (qc != 0);
330   const tran_low_t abs_qc = abs(qc);
331   int cost = 0;
332   const int16_t *const scan = txb_info->scan_order->scan;
333   const int pos = scan[scan_idx];
334 
335   if (is_eob) {
336     cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
337   } else {
338     cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
339   }
340   if (is_nz) {
341     cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost,
342                               txb_ctx->dc_sign_ctx);
343 
344     if (abs_qc > NUM_BASE_LEVELS) {
345       const int ctx =
346           get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class);
347       cost += get_br_cost(abs_qc, txb_costs->lps_cost[ctx]);
348     }
349   }
350   return cost;
351 }
352 
353 static INLINE int get_nz_map_ctx(const uint8_t *const levels,
354                                  const int coeff_idx, const int bwl,
355                                  const int height, const int scan_idx,
356                                  const int is_eob, const TX_SIZE tx_size,
357                                  const TX_CLASS tx_class) {
358   if (is_eob) {
359     if (scan_idx == 0) return 0;
360     if (scan_idx <= (height << bwl) / 8) return 1;
361     if (scan_idx <= (height << bwl) / 4) return 2;
362     return 3;
363   }
364   const int stats =
365       get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class);
366   return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class);
367 }
368 
369 static void get_dist_cost_stats(LevelDownStats *const stats, const int scan_idx,
370                                 const int is_eob,
371                                 const LV_MAP_COEFF_COST *const txb_costs,
372                                 const TxbInfo *const txb_info,
373                                 const TX_CLASS tx_class) {
374   const int16_t *const scan = txb_info->scan_order->scan;
375   const int coeff_idx = scan[scan_idx];
376   const tran_low_t qc = txb_info->qcoeff[coeff_idx];
377   const uint8_t *const levels = txb_info->levels;
378   stats->new_eob = -1;
379   stats->update = 0;
380   stats->rd_low = 0;
381   stats->rd = 0;
382   stats->nz_rd = 0;
383   stats->dist_low = 0;
384   stats->rate_low = 0;
385   stats->low_qc = 0;
386 
387   const tran_low_t tqc = txb_info->tcoeff[coeff_idx];
388   const int dqv = txb_info->dequant[coeff_idx != 0];
389   const int coeff_ctx =
390       get_nz_map_ctx(levels, coeff_idx, txb_info->bwl, txb_info->height,
391                      scan_idx, is_eob, txb_info->tx_size, tx_class);
392   const int qc_cost = get_coeff_cost(qc, scan_idx, is_eob, txb_info, txb_costs,
393                                      coeff_ctx, tx_class);
394   assert(qc != 0);
395   const tran_low_t dqc = qcoeff_to_dqcoeff(qc, coeff_idx, dqv, txb_info->shift,
396                                            txb_info->iqmatrix);
397   const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift);
398 
399   // distortion difference when coefficient is quantized to 0
400   const tran_low_t dqc0 =
401       qcoeff_to_dqcoeff(0, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix);
402 
403   stats->dist0 = get_coeff_dist(tqc, dqc0, txb_info->shift);
404   stats->dist = dqc_dist - stats->dist0;
405   stats->rate = qc_cost;
406 
407   stats->rd = RDCOST(txb_info->rdmult, stats->rate, stats->dist);
408 
409   stats->low_qc = get_lower_coeff(qc);
410 
411   if (is_eob && stats->low_qc == 0) {
412     stats->rd_low = stats->rd;  // disable selection of low_qc in this case.
413   } else {
414     if (stats->low_qc == 0) {
415       stats->dist_low = 0;
416     } else {
417       stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, coeff_idx, dqv,
418                                          txb_info->shift, txb_info->iqmatrix);
419       const int64_t low_dqc_dist =
420           get_coeff_dist(tqc, stats->low_dqc, txb_info->shift);
421       stats->dist_low = low_dqc_dist - stats->dist0;
422     }
423     const int low_qc_cost =
424         get_coeff_cost(stats->low_qc, scan_idx, is_eob, txb_info, txb_costs,
425                        coeff_ctx, tx_class);
426     stats->rate_low = low_qc_cost;
427     stats->rd_low = RDCOST(txb_info->rdmult, stats->rate_low, stats->dist_low);
428   }
429 }
430 
431 static void get_dist_cost_stats_with_eob(
432     LevelDownStats *const stats, const int scan_idx,
433     const LV_MAP_COEFF_COST *const txb_costs, const TxbInfo *const txb_info,
434     const TX_CLASS tx_class) {
435   const int is_eob = 0;
436   get_dist_cost_stats(stats, scan_idx, is_eob, txb_costs, txb_info, tx_class);
437 
438   const int16_t *const scan = txb_info->scan_order->scan;
439   const int coeff_idx = scan[scan_idx];
440   const tran_low_t qc = txb_info->qcoeff[coeff_idx];
441   const int coeff_ctx_temp = get_nz_map_ctx(
442       txb_info->levels, coeff_idx, txb_info->bwl, txb_info->height, scan_idx, 1,
443       txb_info->tx_size, tx_class);
444   const int qc_eob_cost = get_coeff_cost(qc, scan_idx, 1, txb_info, txb_costs,
445                                          coeff_ctx_temp, tx_class);
446   int64_t rd_eob = RDCOST(txb_info->rdmult, qc_eob_cost, stats->dist);
447   if (stats->low_qc != 0) {
448     const int low_qc_eob_cost =
449         get_coeff_cost(stats->low_qc, scan_idx, 1, txb_info, txb_costs,
450                        coeff_ctx_temp, tx_class);
451     int64_t rd_eob_low =
452         RDCOST(txb_info->rdmult, low_qc_eob_cost, stats->dist_low);
453     rd_eob = (rd_eob > rd_eob_low) ? rd_eob_low : rd_eob;
454   }
455 
456   stats->nz_rd = AOMMIN(stats->rd_low, stats->rd) - rd_eob;
457 }
458 
459 static INLINE void update_qcoeff(const int coeff_idx, const tran_low_t qc,
460                                  const TxbInfo *const txb_info) {
461   txb_info->qcoeff[coeff_idx] = qc;
462   txb_info->levels[get_padded_idx(coeff_idx, txb_info->bwl)] =
463       (uint8_t)clamp(abs(qc), 0, INT8_MAX);
464 }
465 
466 static INLINE void update_coeff(const int coeff_idx, const tran_low_t qc,
467                                 const TxbInfo *const txb_info) {
468   update_qcoeff(coeff_idx, qc, txb_info);
469   const int dqv = txb_info->dequant[coeff_idx != 0];
470   txb_info->dqcoeff[coeff_idx] = qcoeff_to_dqcoeff(
471       qc, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix);
472 }
473 
474 void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width,
475                            const int height, uint8_t *const levels) {
476   const int stride = width + TX_PAD_HOR;
477   uint8_t *ls = levels;
478 
479   memset(levels + stride * height, 0,
480          sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
481 
482   for (int i = 0; i < height; i++) {
483     for (int j = 0; j < width; j++) {
484       *ls++ = (uint8_t)clamp(abs(coeff[i * width + j]), 0, INT8_MAX);
485     }
486     for (int j = 0; j < TX_PAD_HOR; j++) {
487       *ls++ = 0;
488     }
489   }
490 }
491 
492 void av1_get_nz_map_contexts_c(const uint8_t *const levels,
493                                const int16_t *const scan, const uint16_t eob,
494                                const TX_SIZE tx_size, const TX_CLASS tx_class,
495                                int8_t *const coeff_contexts) {
496   const int bwl = get_txb_bwl(tx_size);
497   const int height = get_txb_high(tx_size);
498   for (int i = 0; i < eob; ++i) {
499     const int pos = scan[i];
500     coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bwl, height, i,
501                                          i == eob - 1, tx_size, tx_class);
502   }
503 }
504 
505 void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCK *const x,
506                           aom_writer *w, int blk_row, int blk_col, int plane,
507                           int block, TX_SIZE tx_size) {
508   MACROBLOCKD *xd = &x->e_mbd;
509   const CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
510   const int txb_offset =
511       x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
512   const uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
513   const uint16_t eob = eob_txb[block];
514   const uint8_t *entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
515   const int txb_skip_ctx = entropy_ctx[block] & TXB_SKIP_CTX_MASK;
516   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
517   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
518   aom_write_symbol(w, eob == 0, ec_ctx->txb_skip_cdf[txs_ctx][txb_skip_ctx], 2);
519   if (eob == 0) return;
520 
521   const PLANE_TYPE plane_type = get_plane_type(plane);
522   const TX_TYPE tx_type =
523       av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
524                       cm->features.reduced_tx_set_used);
525   // Only y plane's tx_type is transmitted
526   if (plane == 0) {
527     av1_write_tx_type(cm, xd, tx_type, tx_size, w);
528   }
529 
530   int eob_extra;
531   const int eob_pt = get_eob_pos_token(eob, &eob_extra);
532   const int eob_multi_size = txsize_log2_minus4[tx_size];
533   const TX_CLASS tx_class = tx_type_to_class[tx_type];
534   const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
535   switch (eob_multi_size) {
536     case 0:
537       aom_write_symbol(w, eob_pt - 1,
538                        ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5);
539       break;
540     case 1:
541       aom_write_symbol(w, eob_pt - 1,
542                        ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6);
543       break;
544     case 2:
545       aom_write_symbol(w, eob_pt - 1,
546                        ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7);
547       break;
548     case 3:
549       aom_write_symbol(w, eob_pt - 1,
550                        ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8);
551       break;
552     case 4:
553       aom_write_symbol(w, eob_pt - 1,
554                        ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9);
555       break;
556     case 5:
557       aom_write_symbol(w, eob_pt - 1,
558                        ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10);
559       break;
560     default:
561       aom_write_symbol(w, eob_pt - 1,
562                        ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11);
563       break;
564   }
565 
566   const int eob_offset_bits = av1_eob_offset_bits[eob_pt];
567   if (eob_offset_bits > 0) {
568     const int eob_ctx = eob_pt - 3;
569     int eob_shift = eob_offset_bits - 1;
570     int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
571     aom_write_symbol(w, bit,
572                      ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2);
573     for (int i = 1; i < eob_offset_bits; i++) {
574       eob_shift = eob_offset_bits - 1 - i;
575       bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
576       aom_write_bit(w, bit);
577     }
578   }
579 
580   const int width = get_txb_wide(tx_size);
581   const int height = get_txb_high(tx_size);
582   uint8_t levels_buf[TX_PAD_2D];
583   uint8_t *const levels = set_levels(levels_buf, width);
584   const tran_low_t *tcoeff_txb =
585       cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset;
586   const tran_low_t *tcoeff = tcoeff_txb + BLOCK_OFFSET(block);
587   av1_txb_init_levels(tcoeff, width, height, levels);
588   const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
589   const int16_t *const scan = scan_order->scan;
590   DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
591   av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
592 
593   const int bwl = get_txb_bwl(tx_size);
594   for (int c = eob - 1; c >= 0; --c) {
595     const int pos = scan[c];
596     const int coeff_ctx = coeff_contexts[pos];
597     const tran_low_t v = tcoeff[pos];
598     const tran_low_t level = abs(v);
599 
600     if (c == eob - 1) {
601       aom_write_symbol(
602           w, AOMMIN(level, 3) - 1,
603           ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3);
604     } else {
605       aom_write_symbol(w, AOMMIN(level, 3),
606                        ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx],
607                        4);
608     }
609     if (level > NUM_BASE_LEVELS) {
610       // level is above 1.
611       const int base_range = level - 1 - NUM_BASE_LEVELS;
612       const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
613       aom_cdf_prob *cdf =
614           ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx];
615       for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
616         const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
617         aom_write_symbol(w, k, cdf, BR_CDF_SIZE);
618         if (k < BR_CDF_SIZE - 1) break;
619       }
620     }
621   }
622 
623   // Loop to code all signs in the transform block,
624   // starting with the sign of DC (if applicable)
625   for (int c = 0; c < eob; ++c) {
626     const tran_low_t v = tcoeff[scan[c]];
627     const tran_low_t level = abs(v);
628     const int sign = (v < 0) ? 1 : 0;
629     if (level) {
630       if (c == 0) {
631         const int dc_sign_ctx =
632             (entropy_ctx[block] >> DC_SIGN_CTX_SHIFT) & DC_SIGN_CTX_MASK;
633         aom_write_symbol(w, sign, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx],
634                          2);
635       } else {
636         aom_write_bit(w, sign);
637       }
638       if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS)
639         write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS);
640     }
641   }
642 }
643 
644 typedef struct encode_txb_args {
645   const AV1_COMMON *cm;
646   MACROBLOCK *x;
647   aom_writer *w;
648 } ENCODE_TXB_ARGS;
649 
650 void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x,
651                          aom_writer *w, BLOCK_SIZE bsize) {
652   MACROBLOCKD *xd = &x->e_mbd;
653   const int num_planes = av1_num_planes(cm);
654   int block[MAX_MB_PLANE] = { 0 };
655   int row, col;
656   assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
657                                        xd->plane[0].subsampling_y));
658   const int max_blocks_wide = max_block_wide(xd, bsize, 0);
659   const int max_blocks_high = max_block_high(xd, bsize, 0);
660   const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
661   int mu_blocks_wide = mi_size_wide[max_unit_bsize];
662   int mu_blocks_high = mi_size_high[max_unit_bsize];
663   mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
664   mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
665 
666   for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
667     for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
668       for (int plane = 0; plane < num_planes; ++plane) {
669         if (plane && !xd->is_chroma_ref) break;
670         const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
671         const int stepr = tx_size_high_unit[tx_size];
672         const int stepc = tx_size_wide_unit[tx_size];
673         const int step = stepr * stepc;
674         const struct macroblockd_plane *const pd = &xd->plane[plane];
675         const int unit_height = ROUND_POWER_OF_TWO(
676             AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
677         const int unit_width = ROUND_POWER_OF_TWO(
678             AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x);
679         for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
680              blk_row += stepr) {
681           for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
682                blk_col += stepc) {
683             av1_write_coeffs_txb(cm, x, w, blk_row, blk_col, plane,
684                                  block[plane], tx_size);
685             block[plane] += step;
686           }
687         }
688       }
689     }
690   }
691 }
692 
693 // TODO(angiebird): use this function whenever it's possible
694 static int get_tx_type_cost(const MACROBLOCK *x, const MACROBLOCKD *xd,
695                             int plane, TX_SIZE tx_size, TX_TYPE tx_type,
696                             int reduced_tx_set_used) {
697   if (plane > 0) return 0;
698 
699   const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
700 
701   const MB_MODE_INFO *mbmi = xd->mi[0];
702   const int is_inter = is_inter_block(mbmi);
703   if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
704       !xd->lossless[xd->mi[0]->segment_id]) {
705     const int ext_tx_set =
706         get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
707     if (is_inter) {
708       if (ext_tx_set > 0)
709         return x->inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type];
710     } else {
711       if (ext_tx_set > 0) {
712         PREDICTION_MODE intra_dir;
713         if (mbmi->filter_intra_mode_info.use_filter_intra)
714           intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
715                                              .filter_intra_mode];
716         else
717           intra_dir = mbmi->mode;
718         return x->intra_tx_type_costs[ext_tx_set][square_tx_size][intra_dir]
719                                      [tx_type];
720       }
721     }
722   }
723   return 0;
724 }
725 
726 static INLINE void update_coeff_eob_fast(int *eob, int shift,
727                                          const int16_t *dequant_ptr,
728                                          const int16_t *scan,
729                                          const tran_low_t *coeff_ptr,
730                                          tran_low_t *qcoeff_ptr,
731                                          tran_low_t *dqcoeff_ptr) {
732   // TODO(sarahparker) make this work for aomqm
733   int eob_out = *eob;
734   int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7),
735                   dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) };
736 
737   for (int i = *eob - 1; i >= 0; i--) {
738     const int rc = scan[i];
739     const int qcoeff = qcoeff_ptr[rc];
740     const int coeff = coeff_ptr[rc];
741     const int coeff_sign = AOMSIGN(coeff);
742     int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
743 
744     if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) {
745       eob_out--;
746       qcoeff_ptr[rc] = 0;
747       dqcoeff_ptr[rc] = 0;
748     } else {
749       break;
750     }
751   }
752 
753   *eob = eob_out;
754 }
755 
756 static AOM_FORCE_INLINE int warehouse_efficients_txb(
757     const MACROBLOCK *x, const int plane, const int block,
758     const TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
759     const struct macroblock_plane *p, const int eob,
760     const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
761     const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
762     int reduced_tx_set_used) {
763   const tran_low_t *const qcoeff = p->qcoeff + BLOCK_OFFSET(block);
764   const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
765   const int bwl = get_txb_bwl(tx_size);
766   const int width = get_txb_wide(tx_size);
767   const int height = get_txb_high(tx_size);
768   const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
769   const int16_t *const scan = scan_order->scan;
770   uint8_t levels_buf[TX_PAD_2D];
771   uint8_t *const levels = set_levels(levels_buf, width);
772   DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
773   const int eob_multi_size = txsize_log2_minus4[tx_size];
774   const LV_MAP_EOB_COST *const eob_costs =
775       &x->eob_costs[eob_multi_size][plane_type];
776   int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
777 
778   av1_txb_init_levels(qcoeff, width, height, levels);
779 
780   cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
781 
782   cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
783 
784   av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
785 
786   const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] =
787       coeff_costs->lps_cost;
788   int c = eob - 1;
789   {
790     const int pos = scan[c];
791     const tran_low_t v = qcoeff[pos];
792     const int sign = AOMSIGN(v);
793     const int level = (v ^ sign) - sign;
794     const int coeff_ctx = coeff_contexts[pos];
795     cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1];
796 
797     if (v) {
798       // sign bit cost
799       if (level > NUM_BASE_LEVELS) {
800         const int ctx = get_br_ctx_eob(pos, bwl, tx_class);
801         cost += get_br_cost(level, lps_cost[ctx]);
802       }
803       if (c) {
804         cost += av1_cost_literal(1);
805       } else {
806         const int sign01 = (sign ^ sign) - sign;
807         const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
808         cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
809         return cost;
810       }
811     }
812   }
813   const int(*base_cost)[8] = coeff_costs->base_cost;
814   for (c = eob - 2; c >= 1; --c) {
815     const int pos = scan[c];
816     const int coeff_ctx = coeff_contexts[pos];
817     const tran_low_t v = qcoeff[pos];
818     const int level = abs(v);
819     cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
820     if (v) {
821       // sign bit cost
822       cost += av1_cost_literal(1);
823       if (level > NUM_BASE_LEVELS) {
824         const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
825         cost += get_br_cost(level, lps_cost[ctx]);
826       }
827     }
828   }
829   // c == 0 after previous loop
830   {
831     const int pos = scan[c];
832     const tran_low_t v = qcoeff[pos];
833     const int coeff_ctx = coeff_contexts[pos];
834     const int sign = AOMSIGN(v);
835     const int level = (v ^ sign) - sign;
836     cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
837 
838     if (v) {
839       // sign bit cost
840       const int sign01 = (sign ^ sign) - sign;
841       const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
842       cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
843       if (level > NUM_BASE_LEVELS) {
844         const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
845         cost += get_br_cost(level, lps_cost[ctx]);
846       }
847     }
848   }
849   return cost;
850 }
851 
852 static AOM_FORCE_INLINE int warehouse_efficients_txb_laplacian(
853     const MACROBLOCK *x, const int plane, const int block,
854     const TX_SIZE tx_size, const TXB_CTX *const txb_ctx, const int eob,
855     const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
856     const MACROBLOCKD *const xd, const TX_TYPE tx_type, const TX_CLASS tx_class,
857     int reduced_tx_set_used) {
858   const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
859 
860   const int eob_multi_size = txsize_log2_minus4[tx_size];
861   const LV_MAP_EOB_COST *const eob_costs =
862       &x->eob_costs[eob_multi_size][plane_type];
863   int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
864 
865   cost += get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
866 
867   cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
868 
869   cost += av1_cost_coeffs_txb_estimate(x, plane, block, tx_size, tx_type);
870   return cost;
871 }
872 
// Look-up table of the estimated cost of one coefficient, indexed by its
// absolute quantized level clamped to 14 (see av1_cost_coeffs_txb_estimate).
// Determined based on a Laplacian distribution conditioned on estimated
// context.
static const int costLUT[15] = { -1143, 53,   545,  825,  1031,
                                 1209,  1393, 1577, 1763, 1947,
                                 2132,  2317, 2501, 2686, 2871 };
// Constant added once per coefficient after the first: 1.0 at the
// AV1_PROB_COST_SHIFT fixed-point scale.
static const int const_term = (1 << AV1_PROB_COST_SHIFT);
// 1.4427 at the AV1_PROB_COST_SHIFT fixed-point scale, rounded to nearest.
// NOTE(review): 1.4427 is presumably log2(e), as the name suggests - confirm.
static const int loge_par = ((14427 << AV1_PROB_COST_SHIFT) + 5000) / 10000;
880 int av1_cost_coeffs_txb_estimate(const MACROBLOCK *x, const int plane,
881                                  const int block, const TX_SIZE tx_size,
882                                  const TX_TYPE tx_type) {
883   assert(plane == 0);
884 
885   int cost = 0;
886   const struct macroblock_plane *p = &x->plane[plane];
887   const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
888   const int16_t *scan = scan_order->scan;
889   tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
890 
891   int eob = p->eobs[block];
892 
893   // coeffs
894   int c = eob - 1;
895   // eob
896   {
897     const int pos = scan[c];
898     const tran_low_t v = abs(qcoeff[pos]) - 1;
899     cost += (v << (AV1_PROB_COST_SHIFT + 2));
900   }
901   // other coeffs
902   for (c = eob - 2; c >= 0; c--) {
903     const int pos = scan[c];
904     const tran_low_t v = abs(qcoeff[pos]);
905     const int idx = AOMMIN(v, 14);
906 
907     cost += costLUT[idx];
908   }
909 
910   // const_term does not contain DC, and log(e) does not contain eob, so both
911   // (eob-1)
912   cost += (const_term + loge_par) * (eob - 1);
913 
914   return cost;
915 }
916 
917 int av1_cost_coeffs_txb(const MACROBLOCK *x, const int plane, const int block,
918                         const TX_SIZE tx_size, const TX_TYPE tx_type,
919                         const TXB_CTX *const txb_ctx, int reduced_tx_set_used) {
920   const struct macroblock_plane *p = &x->plane[plane];
921   const int eob = p->eobs[block];
922   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
923   const PLANE_TYPE plane_type = get_plane_type(plane);
924   const LV_MAP_COEFF_COST *const coeff_costs =
925       &x->coeff_costs[txs_ctx][plane_type];
926   if (eob == 0) {
927     return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
928   }
929 
930   const MACROBLOCKD *const xd = &x->e_mbd;
931   const TX_CLASS tx_class = tx_type_to_class[tx_type];
932 
933   return warehouse_efficients_txb(x, plane, block, tx_size, txb_ctx, p, eob,
934                                   plane_type, coeff_costs, xd, tx_type,
935                                   tx_class, reduced_tx_set_used);
936 }
937 
938 int av1_cost_coeffs_txb_laplacian(const MACROBLOCK *x, const int plane,
939                                   const int block, const TX_SIZE tx_size,
940                                   const TX_TYPE tx_type,
941                                   const TXB_CTX *const txb_ctx,
942                                   const int reduced_tx_set_used,
943                                   const int adjust_eob) {
944   const struct macroblock_plane *p = &x->plane[plane];
945   int eob = p->eobs[block];
946 
947   if (adjust_eob) {
948     const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
949     const int16_t *scan = scan_order->scan;
950     tran_low_t *tcoeff = p->coeff + BLOCK_OFFSET(block);
951     tran_low_t *qcoeff = p->qcoeff + BLOCK_OFFSET(block);
952     const MACROBLOCKD *xd = &x->e_mbd;
953     const struct macroblockd_plane *const pd = &xd->plane[plane];
954     tran_low_t *dqcoeff = pd->dqcoeff + BLOCK_OFFSET(block);
955     update_coeff_eob_fast(&eob, av1_get_tx_scale(tx_size), p->dequant_QTX, scan,
956                           tcoeff, qcoeff, dqcoeff);
957     p->eobs[block] = eob;
958   }
959 
960   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
961   const PLANE_TYPE plane_type = get_plane_type(plane);
962   const LV_MAP_COEFF_COST *const coeff_costs =
963       &x->coeff_costs[txs_ctx][plane_type];
964   if (eob == 0) {
965     return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
966   }
967 
968   const MACROBLOCKD *const xd = &x->e_mbd;
969   const TX_CLASS tx_class = tx_type_to_class[tx_type];
970 
971   return warehouse_efficients_txb_laplacian(
972       x, plane, block, tx_size, txb_ctx, eob, plane_type, coeff_costs, xd,
973       tx_type, tx_class, reduced_tx_set_used);
974 }
975 
// Greedy rate-distortion optimization of the quantized coefficients of one
// transform block, walking the scan backwards from the last significant
// coefficient to DC. For each nonzero coefficient the per-coefficient stats
// (LevelDownStats, filled by get_dist_cost_stats*) decide whether to replace
// it by stats.low_qc; the walk also considers moving the eob to the current
// position, and finally zeroing the whole block.
//
// txb_info       block state; qcoeff/dqcoeff/levels are modified through
//                update_coeff() and txb_info->eob may be lowered.
// txb_costs      coefficient cost tables.
// txb_eob_costs  eob cost tables.
// rate_cost      out: total rate of the chosen result (includes
//                txb_info->tx_type_cost when the final eob is > 0).
//                NOT written when txb_info->eob is 0 on entry.
//
// Returns 1 if any coefficient or the eob was changed, 0 otherwise.
static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
                        const LV_MAP_EOB_COST *txb_eob_costs, int *rate_cost) {
  int update = 0;
  // Nothing to optimize in an empty block (note: *rate_cost is left untouched).
  if (txb_info->eob == 0) return update;
  const int16_t *const scan = txb_info->scan_order->scan;
  // Rate of signalling the initial eob position.
  const int init_eob = txb_info->eob;
  const TX_CLASS tx_class = tx_type_to_class[txb_info->tx_type];
  const int eob_cost =
      get_eob_cost(init_eob, txb_eob_costs, txb_costs, tx_class);

  // backward optimize the level-k map
  // accu_rate / accu_dist accumulate the rate and distortion of everything
  // from the current eob down to the coefficient being visited.
  int accu_rate = eob_cost;
  int64_t accu_dist = 0;
  int64_t prev_eob_rd_cost = INT64_MAX;
  int64_t cur_eob_rd_cost = 0;

  // Step 1: the coefficient at the initial eob. It may be lowered, but never
  // to zero (that would implicitly move the eob).
  {
    const int si = init_eob - 1;
    const int coeff_idx = scan[si];
    LevelDownStats stats;
    // The third argument (is_eob flag) is always 1 here.
    get_dist_cost_stats(&stats, si, si == init_eob - 1, txb_costs, txb_info,
                        tx_class);
    if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) {
      update = 1;
      update_coeff(coeff_idx, stats.low_qc, txb_info);
      accu_rate += stats.rate_low;
      accu_dist += stats.dist_low;
    } else {
      accu_rate += stats.rate;
      accu_dist += stats.dist;
    }
  }

  // Step 2: walk backwards while the eob may still move. has_nz_tail counts
  // nonzero coefficients seen since the last eob change for which lowering was
  // not beneficial; once two have been seen the eob is treated as fixed.
  int si = init_eob - 2;
  int8_t has_nz_tail = 0;
  // eob is not fixed
  for (; si >= 0 && has_nz_tail < 2; --si) {
    assert(si != init_eob - 1);
    const int coeff_idx = scan[si];
    tran_low_t qc = txb_info->qcoeff[coeff_idx];

    if (qc == 0) {
      // A zero coefficient only contributes its "level 0" base cost.
      const int coeff_ctx =
          get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl,
                               txb_info->tx_size, tx_class);
      accu_rate += txb_costs->base_cost[coeff_ctx][0];
    } else {
      LevelDownStats stats;
      get_dist_cost_stats_with_eob(&stats, si, txb_costs, txb_info, tx_class);
      // check if it is better to make this the last significant coefficient
      // cur_eob_rd_cost: RD cost of just signalling the eob at si + 1.
      // prev_eob_rd_cost: RD cost accumulated so far for the current tail plus
      // stats.nz_rd (NOTE(review): exact meaning of nz_rd is defined in
      // get_dist_cost_stats_with_eob, outside this block).
      int cur_eob_rate =
          get_eob_cost(si + 1, txb_eob_costs, txb_costs, tx_class);
      cur_eob_rd_cost = RDCOST(txb_info->rdmult, cur_eob_rate, 0);
      prev_eob_rd_cost =
          RDCOST(txb_info->rdmult, accu_rate, accu_dist) + stats.nz_rd;
      if (cur_eob_rd_cost <= prev_eob_rd_cost) {
        // Move the eob here: zero every coefficient after position si.
        update = 1;
        for (int j = si + 1; j < txb_info->eob; j++) {
          const int coeff_pos_j = scan[j];
          update_coeff(coeff_pos_j, 0, txb_info);
        }
        txb_info->eob = si + 1;

        // rerun cost calculation due to change of eob
        accu_rate = cur_eob_rate;
        accu_dist = 0;
        get_dist_cost_stats(&stats, si, 1, txb_costs, txb_info, tx_class);
        // As in step 1, the new last coefficient is never lowered to zero.
        if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) {
          update = 1;
          update_coeff(coeff_idx, stats.low_qc, txb_info);
          accu_rate += stats.rate_low;
          accu_dist += stats.dist_low;
        } else {
          accu_rate += stats.rate;
          accu_dist += stats.dist;
        }

        // reset non zero tail when new eob is found
        has_nz_tail = 0;
      } else {
        // Keep the eob; lower this coefficient only if that is cheaper and it
        // is not the current last coefficient.
        int bUpdCoeff = 0;
        if (stats.rd_low < stats.rd) {
          if ((si < txb_info->eob - 1)) {
            bUpdCoeff = 1;
            update = 1;
          }
        } else {
          ++has_nz_tail;
        }

        if (bUpdCoeff) {
          update_coeff(coeff_idx, stats.low_qc, txb_info);
          accu_rate += stats.rate_low;
          accu_dist += stats.dist_low;
        } else {
          accu_rate += stats.rate;
          accu_dist += stats.dist;
        }
      }
    }
  }  // for (si)

  // Step 3: remaining coefficients, continuing from where step 2 stopped. The
  // eob is no longer reconsidered; each coefficient is only tested for
  // lowering.
  // eob is fixed
  for (; si >= 0; --si) {
    assert(si != init_eob - 1);
    const int coeff_idx = scan[si];
    tran_low_t qc = txb_info->qcoeff[coeff_idx];

    if (qc == 0) {
      const int coeff_ctx =
          get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl,
                               txb_info->tx_size, tx_class);
      accu_rate += txb_costs->base_cost[coeff_ctx][0];
    } else {
      LevelDownStats stats;
      get_dist_cost_stats(&stats, si, 0, txb_costs, txb_info, tx_class);

      int bUpdCoeff = 0;
      if (stats.rd_low < stats.rd) {
        if ((si < txb_info->eob - 1)) {
          bUpdCoeff = 1;
          update = 1;
        }
      }
      if (bUpdCoeff) {
        update_coeff(coeff_idx, stats.low_qc, txb_info);
        accu_rate += stats.rate_low;
        accu_dist += stats.dist_low;
      } else {
        accu_rate += stats.rate;
        accu_dist += stats.dist;
      }
    }
  }  // for (si)

  // Step 4: compare the optimized block (plus its "coded" flag) against
  // dropping the block entirely (the "all zero" flag with zero distortion
  // term).
  int non_zero_blk_rate =
      txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][0];
  prev_eob_rd_cost =
      RDCOST(txb_info->rdmult, accu_rate + non_zero_blk_rate, accu_dist);

  int zero_blk_rate =
      txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][1];
  int64_t zero_blk_rd_cost = RDCOST(txb_info->rdmult, zero_blk_rate, 0);
  if (zero_blk_rd_cost <= prev_eob_rd_cost) {
    update = 1;
    for (int j = 0; j < txb_info->eob; j++) {
      const int coeff_pos_j = scan[j];
      update_coeff(coeff_pos_j, 0, txb_info);
    }
    txb_info->eob = 0;
  }

  // record total rate cost
  *rate_cost = zero_blk_rd_cost <= prev_eob_rd_cost
                   ? zero_blk_rate
                   : accu_rate + non_zero_blk_rate;

  // The transform type is only signalled for blocks that keep coefficients.
  if (txb_info->eob > 0) {
    *rate_cost += txb_info->tx_type_cost;
  }

  return update;
}
1140 
1141 #if CONFIG_HTB_TRELLIS
1142 static void hbt_init() {
1143   hbt_hash_table =
1144       aom_malloc(sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH);
1145   memset(hbt_hash_table, 0,
1146          sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH);
1147   av1_crc32c_calculator_init(&crc_calculator);  // 31 bit: qc & ctx
1148 
1149   hbt_needs_init = 0;
1150 }
1151 
1152 void hbt_destroy() { aom_free(hbt_hash_table); }
1153 
1154 static int hbt_hash_miss(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
1155                          TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
1156                          const LV_MAP_EOB_COST *txb_eob_costs,
1157                          const struct macroblock_plane *p, int block,
1158                          int fast_mode, int *rate_cost) {
1159   (void)fast_mode;
1160   const int16_t *scan = txb_info->scan_order->scan;
1161   int prev_eob = txb_info->eob;
1162   assert(HBT_EOB <= 16);  // Lengthen array if allowing longer eob.
1163   int32_t prev_coeff[16];
1164   for (int i = 0; i < prev_eob; i++) {
1165     prev_coeff[i] = txb_info->qcoeff[scan[i]];
1166   }
1167   for (int i = prev_eob; i < HBT_EOB; i++) {
1168     prev_coeff[i] = 0;  // For compiler piece of mind.
1169   }
1170 
1171   av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
1172                       txb_info->levels);
1173 
1174   const int update =
1175       optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost);
1176 
1177   // Overwrite old entry
1178   uint16_t hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE;
1179   uint16_t hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH;
1180   hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1181       .rate_cost = *rate_cost;
1182   hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index].init = 1;
1183   hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1184       .hbt_qc_hash = hbt_qc_hash;
1185   hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1186       .hbt_ctx_hash = hbt_ctx_hash;
1187   assert(prev_eob >= txb_info->eob);  // eob can't get longer
1188   for (int i = 0; i < txb_info->eob; i++) {
1189     // Record how coeff changed. Convention: towards zero is negative.
1190     if (txb_info->qcoeff[scan[i]] > 0)
1191       hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1192           .deltas[i] = txb_info->qcoeff[scan[i]] - prev_coeff[i];
1193     else
1194       hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1195           .deltas[i] = prev_coeff[i] - txb_info->qcoeff[scan[i]];
1196   }
1197   for (int i = txb_info->eob; i < prev_eob; i++) {
1198     // If eob got shorter, record that all after it changed to zero.
1199     if (prev_coeff[i] > 0)
1200       hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1201           .deltas[i] = -prev_coeff[i];
1202     else
1203       hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1204           .deltas[i] = prev_coeff[i];
1205   }
1206   for (int i = prev_eob; i < HBT_EOB; i++) {
1207     // Record 'no change' after optimized coefficients run out.
1208     hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1209         .deltas[i] = 0;
1210   }
1211 
1212   if (update) {
1213     p->eobs[block] = txb_info->eob;
1214     p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
1215         txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
1216   }
1217   return txb_info->eob;
1218 }
1219 
1220 static int hbt_hash_hit(uint32_t hbt_table_index, int hbt_array_index,
1221                         TxbInfo *txb_info, const struct macroblock_plane *p,
1222                         int block, int *rate_cost) {
1223   const int16_t *scan = txb_info->scan_order->scan;
1224   int new_eob = 0;
1225   int update = 0;
1226 
1227   for (int i = 0; i < txb_info->eob; i++) {
1228     // Delta convention is negatives go towards zero, so only apply those ones.
1229     if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1230             .deltas[i] < 0) {
1231       if (txb_info->qcoeff[scan[i]] > 0)
1232         txb_info->qcoeff[scan[i]] +=
1233             hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1234                 .deltas[i];
1235       else
1236         txb_info->qcoeff[scan[i]] -=
1237             hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1238                 .deltas[i];
1239 
1240       update = 1;
1241       update_coeff(scan[i], txb_info->qcoeff[scan[i]], txb_info);
1242     }
1243     if (txb_info->qcoeff[scan[i]]) new_eob = i + 1;
1244   }
1245 
1246   // Rate_cost can be calculated here instead (av1_cost_coeffs_txb), but
1247   // it is expensive and gives little benefit as long as qc_hash is high bit
1248   *rate_cost =
1249       hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1250           .rate_cost;
1251 
1252   if (update) {
1253     txb_info->eob = new_eob;
1254     p->eobs[block] = txb_info->eob;
1255     p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
1256         txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
1257   }
1258 
1259   return txb_info->eob;
1260 }
1261 
1262 static int hbt_search_match(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
1263                             TxbInfo *txb_info,
1264                             const LV_MAP_COEFF_COST *txb_costs,
1265                             const LV_MAP_EOB_COST *txb_eob_costs,
1266                             const struct macroblock_plane *p, int block,
1267                             int fast_mode, int *rate_cost) {
1268   // Check for qcoeff match
1269   int hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH;
1270   int hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE;
1271 
1272   if (hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1273               .hbt_qc_hash == hbt_qc_hash &&
1274       hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1275               .hbt_ctx_hash == hbt_ctx_hash &&
1276       hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index]
1277           .init) {
1278     return hbt_hash_hit(hbt_table_index, hbt_array_index, txb_info, p, block,
1279                         rate_cost);
1280   } else {
1281     return hbt_hash_miss(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
1282                          txb_eob_costs, p, block, fast_mode, rate_cost);
1283   }
1284 }
1285 
1286 static int hbt_create_hashes(TxbInfo *txb_info,
1287                              const LV_MAP_COEFF_COST *txb_costs,
1288                              const LV_MAP_EOB_COST *txb_eob_costs,
1289                              const struct macroblock_plane *p, int block,
1290                              int fast_mode, int *rate_cost) {
1291   // Initialize hash table if needed.
1292   if (hbt_needs_init) {
1293     hbt_init();
1294   }
1295 
1296   //// Hash creation
1297   uint8_t txb_hash_data[256];  // Asserts below to ensure enough space.
1298   const int16_t *scan = txb_info->scan_order->scan;
1299   uint8_t chunk = 0;
1300   int hash_data_index = 0;
1301 
1302   // Make qc_hash.
1303   int packing_index = 0;  // needed for packing.
1304   for (int i = 0; i < txb_info->eob; i++) {
1305     tran_low_t prechunk = txb_info->qcoeff[scan[i]];
1306 
1307     // Softening: Improves speed. Aligns with signed deltas.
1308     if (prechunk < 0) prechunk *= -1;
1309 
1310     // Early kick out: Don't apply feature if there are large coeffs:
1311     // If this kickout value is removed or raised beyond int8_t,
1312     // widen deltas type in OptTxbQcoeff struct.
1313     assert((int8_t)HBT_KICKOUT == HBT_KICKOUT);  // If not, widen types.
1314     if (prechunk > HBT_KICKOUT) {
1315       av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
1316                           txb_info->levels);
1317 
1318       const int update =
1319           optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost);
1320 
1321       if (update) {
1322         p->eobs[block] = txb_info->eob;
1323         p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
1324             txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
1325       }
1326       return txb_info->eob;
1327     }
1328 
1329     // Since coeffs are 0 to 3, only 2 bits are needed: pack into bytes
1330     if (packing_index == 0) txb_hash_data[hash_data_index] = 0;
1331     chunk = prechunk << packing_index;
1332     packing_index += 2;
1333     txb_hash_data[hash_data_index] |= chunk;
1334 
1335     // Full byte:
1336     if (packing_index == 8) {
1337       packing_index = 0;
1338       hash_data_index++;
1339     }
1340   }
1341   // Needed when packing_index != 0, to include final byte.
1342   hash_data_index++;
1343   assert(hash_data_index <= 64);
1344   // 31 bit qc_hash: index to array
1345   uint32_t hbt_qc_hash =
1346       av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index);
1347 
1348   // Make ctx_hash.
1349   hash_data_index = 0;
1350   tran_low_t prechunk;
1351 
1352   for (int i = 0; i < txb_info->eob; i++) {
1353     // Save as magnitudes towards or away from zero.
1354     if (txb_info->tcoeff[scan[i]] >= 0)
1355       prechunk = txb_info->tcoeff[scan[i]] - txb_info->dqcoeff[scan[i]];
1356     else
1357       prechunk = txb_info->dqcoeff[scan[i]] - txb_info->tcoeff[scan[i]];
1358 
1359     chunk = prechunk & 0xff;
1360     txb_hash_data[hash_data_index++] = chunk;
1361   }
1362 
1363   // Extra ctx data:
1364   // Include dequants.
1365   txb_hash_data[hash_data_index++] = txb_info->dequant[0] & 0xff;
1366   txb_hash_data[hash_data_index++] = txb_info->dequant[1] & 0xff;
1367   chunk = txb_info->txb_ctx->txb_skip_ctx & 0xff;
1368   txb_hash_data[hash_data_index++] = chunk;
1369   chunk = txb_info->txb_ctx->dc_sign_ctx & 0xff;
1370   txb_hash_data[hash_data_index++] = chunk;
1371   // eob
1372   chunk = txb_info->eob & 0xff;
1373   txb_hash_data[hash_data_index++] = chunk;
1374   // rdmult (int64)
1375   chunk = txb_info->rdmult & 0xff;
1376   txb_hash_data[hash_data_index++] = chunk;
1377   // tx_type
1378   chunk = txb_info->tx_type & 0xff;
1379   txb_hash_data[hash_data_index++] = chunk;
1380   // base_eob_cost
1381   for (int i = 1; i < 3; i++) {  // i = 0 are softened away
1382     for (int j = 0; j < SIG_COEF_CONTEXTS_EOB; j++) {
1383       chunk = (txb_costs->base_eob_cost[j][i] & 0xff00) >> 8;
1384       txb_hash_data[hash_data_index++] = chunk;
1385     }
1386   }
1387   // eob_cost
1388   for (int i = 0; i < 11; i++) {
1389     for (int j = 0; j < 2; j++) {
1390       chunk = (txb_eob_costs->eob_cost[j][i] & 0xff00) >> 8;
1391       txb_hash_data[hash_data_index++] = chunk;
1392     }
1393   }
1394   // dc_sign_cost
1395   for (int i = 0; i < 2; i++) {
1396     for (int j = 0; j < DC_SIGN_CONTEXTS; j++) {
1397       chunk = (txb_costs->dc_sign_cost[j][i] & 0xff00) >> 8;
1398       txb_hash_data[hash_data_index++] = chunk;
1399     }
1400   }
1401 
1402   assert(hash_data_index <= 256);
1403   // 31 bit ctx_hash: used to index table
1404   uint32_t hbt_ctx_hash =
1405       av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index);
1406   //// End hash creation
1407 
1408   return hbt_search_match(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
1409                           txb_eob_costs, p, block, fast_mode, rate_cost);
1410 }
1411 #endif  // CONFIG_HTB_TRELLIS
1412 
1413 static AOM_FORCE_INLINE int get_two_coeff_cost_simple(
1414     int ci, tran_low_t abs_qc, int coeff_ctx,
1415     const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class,
1416     const uint8_t *levels, int *cost_low) {
1417   // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
1418   // and not the last (scan_idx != eob - 1)
1419   assert(ci > 0);
1420   int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
1421   int diff = 0;
1422   if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4];
1423   if (abs_qc) {
1424     cost += av1_cost_literal(1);
1425     if (abs_qc > NUM_BASE_LEVELS) {
1426       const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
1427       int brcost_diff = 0;
1428       cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx],
1429                                     &brcost_diff);
1430       diff += brcost_diff;
1431     }
1432   }
1433   *cost_low = cost - diff;
1434 
1435   return cost;
1436 }
1437 
1438 static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign,
1439                                      int coeff_ctx, int dc_sign_ctx,
1440                                      const LV_MAP_COEFF_COST *txb_costs,
1441                                      int bwl, TX_CLASS tx_class) {
1442   int cost = 0;
1443   cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
1444   if (abs_qc != 0) {
1445     if (ci == 0) {
1446       cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
1447     } else {
1448       cost += av1_cost_literal(1);
1449     }
1450     if (abs_qc > NUM_BASE_LEVELS) {
1451       int br_ctx;
1452       br_ctx = get_br_ctx_eob(ci, bwl, tx_class);
1453       cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
1454     }
1455   }
1456   return cost;
1457 }
1458 
1459 static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc,
1460                                          int sign, int coeff_ctx,
1461                                          int dc_sign_ctx,
1462                                          const LV_MAP_COEFF_COST *txb_costs,
1463                                          int bwl, TX_CLASS tx_class,
1464                                          const uint8_t *levels) {
1465   int cost = 0;
1466   if (is_last) {
1467     cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
1468   } else {
1469     cost += txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
1470   }
1471   if (abs_qc != 0) {
1472     if (ci == 0) {
1473       cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
1474     } else {
1475       cost += av1_cost_literal(1);
1476     }
1477     if (abs_qc > NUM_BASE_LEVELS) {
1478       int br_ctx;
1479       if (is_last)
1480         br_ctx = get_br_ctx_eob(ci, bwl, tx_class);
1481       else
1482         br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
1483       cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
1484     }
1485   }
1486   return cost;
1487 }
1488 
1489 static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv,
1490                                   int shift, tran_low_t *qc_low,
1491                                   tran_low_t *dqc_low) {
1492   tran_low_t abs_qc_low = abs_qc - 1;
1493   *qc_low = (-sign ^ abs_qc_low) + sign;
1494   assert((sign ? -abs_qc_low : abs_qc_low) == *qc_low);
1495   tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
1496   *dqc_low = (-sign ^ abs_dqc_low) + sign;
1497   assert((sign ? -abs_dqc_low : abs_dqc_low) == *dqc_low);
1498 }
1499 
1500 static INLINE void update_coeff_general(
1501     int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size,
1502     TX_CLASS tx_class, int bwl, int height, int64_t rdmult, int shift,
1503     int dc_sign_ctx, const int16_t *dequant, const int16_t *scan,
1504     const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
1505     tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels,
1506     const qm_val_t *iqmatrix) {
1507   const int dqv = get_dqv(dequant, scan[si], iqmatrix);
1508   const int ci = scan[si];
1509   const tran_low_t qc = qcoeff[ci];
1510   const int is_last = si == (eob - 1);
1511   const int coeff_ctx = get_lower_levels_ctx_general(
1512       is_last, si, bwl, height, levels, ci, tx_size, tx_class);
1513   if (qc == 0) {
1514     *accu_rate += txb_costs->base_cost[coeff_ctx][0];
1515   } else {
1516     const int sign = (qc < 0) ? 1 : 0;
1517     const tran_low_t abs_qc = abs(qc);
1518     const tran_low_t tqc = tcoeff[ci];
1519     const tran_low_t dqc = dqcoeff[ci];
1520     const int64_t dist = get_coeff_dist(tqc, dqc, shift);
1521     const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
1522     const int rate =
1523         get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx,
1524                                dc_sign_ctx, txb_costs, bwl, tx_class, levels);
1525     const int64_t rd = RDCOST(rdmult, rate, dist);
1526 
1527     tran_low_t qc_low, dqc_low;
1528     tran_low_t abs_qc_low;
1529     int64_t dist_low, rd_low;
1530     int rate_low;
1531     if (abs_qc == 1) {
1532       abs_qc_low = qc_low = dqc_low = 0;
1533       dist_low = dist0;
1534       rate_low = txb_costs->base_cost[coeff_ctx][0];
1535     } else {
1536       get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
1537       abs_qc_low = abs_qc - 1;
1538       dist_low = get_coeff_dist(tqc, dqc_low, shift);
1539       rate_low =
1540           get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
1541                                  dc_sign_ctx, txb_costs, bwl, tx_class, levels);
1542     }
1543 
1544     rd_low = RDCOST(rdmult, rate_low, dist_low);
1545     if (rd_low < rd) {
1546       qcoeff[ci] = qc_low;
1547       dqcoeff[ci] = dqc_low;
1548       levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
1549       *accu_rate += rate_low;
1550       *accu_dist += dist_low - dist0;
1551     } else {
1552       *accu_rate += rate;
1553       *accu_dist += dist - dist0;
1554     }
1555   }
1556 }
1557 
1558 static AOM_FORCE_INLINE void update_coeff_simple(
1559     int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class,
1560     int bwl, int64_t rdmult, int shift, const int16_t *dequant,
1561     const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs,
1562     const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
1563     uint8_t *levels, const qm_val_t *iqmatrix) {
1564   const int dqv = get_dqv(dequant, scan[si], iqmatrix);
1565   (void)eob;
1566   // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
1567   // and not the last (scan_idx != eob - 1)
1568   assert(si != eob - 1);
1569   assert(si > 0);
1570   const int ci = scan[si];
1571   const tran_low_t qc = qcoeff[ci];
1572   const int coeff_ctx =
1573       get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class);
1574   if (qc == 0) {
1575     *accu_rate += txb_costs->base_cost[coeff_ctx][0];
1576   } else {
1577     const tran_low_t abs_qc = abs(qc);
1578     const tran_low_t abs_tqc = abs(tcoeff[ci]);
1579     const tran_low_t abs_dqc = abs(dqcoeff[ci]);
1580     int rate_low = 0;
1581     const int rate = get_two_coeff_cost_simple(
1582         ci, abs_qc, coeff_ctx, txb_costs, bwl, tx_class, levels, &rate_low);
1583     if (abs_dqc < abs_tqc) {
1584       *accu_rate += rate;
1585       return;
1586     }
1587 
1588     const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift);
1589     const int64_t rd = RDCOST(rdmult, rate, dist);
1590 
1591     const tran_low_t abs_qc_low = abs_qc - 1;
1592     const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
1593     const int64_t dist_low = get_coeff_dist(abs_tqc, abs_dqc_low, shift);
1594     const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
1595 
1596     if (rd_low < rd) {
1597       const int sign = (qc < 0) ? 1 : 0;
1598       qcoeff[ci] = (-sign ^ abs_qc_low) + sign;
1599       dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
1600       levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
1601       *accu_rate += rate_low;
1602     } else {
1603       *accu_rate += rate;
1604     }
1605   }
1606 }
1607 
1608 static AOM_FORCE_INLINE void update_coeff_eob(
1609     int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
1610     int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height,
1611     int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant,
1612     const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs,
1613     const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
1614     tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness,
1615     const qm_val_t *iqmatrix) {
1616   const int dqv = get_dqv(dequant, scan[si], iqmatrix);
1617   assert(si != *eob - 1);
1618   const int ci = scan[si];
1619   const tran_low_t qc = qcoeff[ci];
1620   const int coeff_ctx =
1621       get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class);
1622   if (qc == 0) {
1623     *accu_rate += txb_costs->base_cost[coeff_ctx][0];
1624   } else {
1625     int lower_level = 0;
1626     const tran_low_t abs_qc = abs(qc);
1627     const tran_low_t tqc = tcoeff[ci];
1628     const tran_low_t dqc = dqcoeff[ci];
1629     const int sign = (qc < 0) ? 1 : 0;
1630     const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
1631     int64_t dist = get_coeff_dist(tqc, dqc, shift) - dist0;
1632     int rate =
1633         get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx,
1634                                txb_costs, bwl, tx_class, levels);
1635     int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
1636 
1637     tran_low_t qc_low, dqc_low;
1638     tran_low_t abs_qc_low;
1639     int64_t dist_low, rd_low;
1640     int rate_low;
1641     if (abs_qc == 1) {
1642       abs_qc_low = 0;
1643       dqc_low = qc_low = 0;
1644       dist_low = 0;
1645       rate_low = txb_costs->base_cost[coeff_ctx][0];
1646       rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist);
1647     } else {
1648       get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
1649       abs_qc_low = abs_qc - 1;
1650       dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0;
1651       rate_low =
1652           get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx,
1653                                  dc_sign_ctx, txb_costs, bwl, tx_class, levels);
1654       rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
1655     }
1656 
1657     int lower_level_new_eob = 0;
1658     const int new_eob = si + 1;
1659     const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bwl, height, si);
1660     const int new_eob_cost =
1661         get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class);
1662     int rate_coeff_eob =
1663         new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob,
1664                                           dc_sign_ctx, txb_costs, bwl,
1665                                           tx_class);
1666     int64_t dist_new_eob = dist;
1667     int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob);
1668 
1669     if (abs_qc_low > 0) {
1670       const int rate_coeff_eob_low =
1671           new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign,
1672                                             coeff_ctx_new_eob, dc_sign_ctx,
1673                                             txb_costs, bwl, tx_class);
1674       const int64_t dist_new_eob_low = dist_low;
1675       const int64_t rd_new_eob_low =
1676           RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low);
1677       if (rd_new_eob_low < rd_new_eob) {
1678         lower_level_new_eob = 1;
1679         rd_new_eob = rd_new_eob_low;
1680         rate_coeff_eob = rate_coeff_eob_low;
1681         dist_new_eob = dist_new_eob_low;
1682       }
1683     }
1684 
1685     if (rd_low < rd) {
1686       lower_level = 1;
1687       rd = rd_low;
1688       rate = rate_low;
1689       dist = dist_low;
1690     }
1691 
1692     if (sharpness == 0 && rd_new_eob < rd) {
1693       for (int ni = 0; ni < *nz_num; ++ni) {
1694         int last_ci = nz_ci[ni];
1695         levels[get_padded_idx(last_ci, bwl)] = 0;
1696         qcoeff[last_ci] = 0;
1697         dqcoeff[last_ci] = 0;
1698       }
1699       *eob = new_eob;
1700       *nz_num = 0;
1701       *accu_rate = rate_coeff_eob;
1702       *accu_dist = dist_new_eob;
1703       lower_level = lower_level_new_eob;
1704     } else {
1705       *accu_rate += rate;
1706       *accu_dist += dist;
1707     }
1708 
1709     if (lower_level) {
1710       qcoeff[ci] = qc_low;
1711       dqcoeff[ci] = dqc_low;
1712       levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
1713     }
1714     if (qcoeff[ci]) {
1715       nz_ci[*nz_num] = ci;
1716       ++*nz_num;
1717     }
1718   }
1719 }
1720 
1721 static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,
1722                                int nz_num, int *nz_ci, int64_t rdmult,
1723                                int skip_cost, int non_skip_cost,
1724                                tran_low_t *qcoeff, tran_low_t *dqcoeff,
1725                                int sharpness) {
1726   const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist);
1727   const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0);
1728   if (sharpness == 0 && rd_new_eob < rd) {
1729     for (int i = 0; i < nz_num; ++i) {
1730       const int ci = nz_ci[i];
1731       qcoeff[ci] = 0;
1732       dqcoeff[ci] = 0;
1733       // no need to set up levels because this is the last step
1734       // levels[get_padded_idx(ci, bwl)] = 0;
1735     }
1736     *accu_rate = 0;
1737     *eob = 0;
1738   }
1739 }
1740 
1741 int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
1742                          int block, TX_SIZE tx_size, TX_TYPE tx_type,
1743                          const TXB_CTX *const txb_ctx, int *rate_cost,
1744                          int sharpness, int fast_mode) {
1745   MACROBLOCKD *xd = &x->e_mbd;
1746   struct macroblockd_plane *pd = &xd->plane[plane];
1747   const struct macroblock_plane *p = &x->plane[plane];
1748   const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
1749   const int16_t *scan = scan_order->scan;
1750   const int shift = av1_get_tx_scale(tx_size);
1751   int eob = p->eobs[block];
1752   const int16_t *dequant = p->dequant_QTX;
1753   const qm_val_t *iqmatrix =
1754       av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
1755   const int block_offset = BLOCK_OFFSET(block);
1756   tran_low_t *qcoeff = p->qcoeff + block_offset;
1757   tran_low_t *dqcoeff = pd->dqcoeff + block_offset;
1758   const tran_low_t *tcoeff = p->coeff + block_offset;
1759 
1760   // This function is not called if eob = 0.
1761   assert(eob > 0);
1762 
1763   if (fast_mode) {
1764     update_coeff_eob_fast(&eob, shift, dequant, scan, tcoeff, qcoeff, dqcoeff);
1765     p->eobs[block] = eob;
1766     if (eob == 0) {
1767       *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size);
1768       return eob;
1769     }
1770   }
1771 
1772   const AV1_COMMON *cm = &cpi->common;
1773   const PLANE_TYPE plane_type = get_plane_type(plane);
1774   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
1775   const TX_CLASS tx_class = tx_type_to_class[tx_type];
1776   const MB_MODE_INFO *mbmi = xd->mi[0];
1777   const int bwl = get_txb_bwl(tx_size);
1778   const int width = get_txb_wide(tx_size);
1779   const int height = get_txb_high(tx_size);
1780   assert(width == (1 << bwl));
1781   const int is_inter = is_inter_block(mbmi);
1782   const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
1783   const int eob_multi_size = txsize_log2_minus4[tx_size];
1784   const LV_MAP_EOB_COST *txb_eob_costs =
1785       &x->eob_costs[eob_multi_size][plane_type];
1786 
1787   const int rshift =
1788       (sharpness +
1789        (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
1790             ? 7 - mbmi->segment_id
1791             : 2) +
1792        (cpi->oxcf.aq_mode != VARIANCE_AQ &&
1793                 cpi->oxcf.deltaq_mode == DELTA_Q_PERCEPTUAL &&
1794                 cm->delta_q_info.delta_q_present_flag && x->sb_energy_level < 0
1795             ? (3 - x->sb_energy_level)
1796             : 0));
1797   const int64_t rdmult =
1798       (((int64_t)x->rdmult *
1799         (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) +
1800        2) >>
1801       rshift;
1802 
1803   uint8_t levels_buf[TX_PAD_2D];
1804   uint8_t *const levels = set_levels(levels_buf, width);
1805 
1806   if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels);
1807 
1808   // TODO(angirbird): check iqmatrix
1809 
1810   const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
1811   const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
1812   const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class);
1813   int accu_rate = eob_cost;
1814   int64_t accu_dist = 0;
1815   int si = eob - 1;
1816   const int ci = scan[si];
1817   const tran_low_t qc = qcoeff[ci];
1818   const tran_low_t abs_qc = abs(qc);
1819   const int sign = qc < 0;
1820   const int max_nz_num = 2;
1821   int nz_num = 1;
1822   int nz_ci[3] = { ci, 0, 0 };
1823   if (abs_qc >= 2) {
1824     update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class,
1825                          bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
1826                          dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
1827                          levels, iqmatrix);
1828     --si;
1829   } else {
1830     assert(abs_qc == 1);
1831     const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, si);
1832     accu_rate +=
1833         get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx,
1834                            txb_costs, bwl, tx_class);
1835     const tran_low_t tqc = tcoeff[ci];
1836     const tran_low_t dqc = dqcoeff[ci];
1837     const int64_t dist = get_coeff_dist(tqc, dqc, shift);
1838     const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
1839     accu_dist += dist - dist0;
1840     --si;
1841   }
1842 
1843 #define UPDATE_COEFF_EOB_CASE(tx_class_literal)                            \
1844   case tx_class_literal:                                                   \
1845     for (; si >= 0 && nz_num <= max_nz_num && !fast_mode; --si) {          \
1846       update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si,   \
1847                        tx_size, tx_class_literal, bwl, height,             \
1848                        txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
1849                        txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff,  \
1850                        levels, sharpness, iqmatrix);                       \
1851     }                                                                      \
1852     break;
1853   switch (tx_class) {
1854     UPDATE_COEFF_EOB_CASE(TX_CLASS_2D);
1855     UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ);
1856     UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT);
1857 #undef UPDATE_COEFF_EOB_CASE
1858     default: assert(false);
1859   }
1860 
1861   if (si == -1 && nz_num <= max_nz_num) {
1862     update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost,
1863                 non_skip_cost, qcoeff, dqcoeff, sharpness);
1864   }
1865 
1866 #define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal)                             \
1867   case tx_class_literal:                                                       \
1868     for (; si >= 1; --si) {                                                    \
1869       update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bwl, \
1870                           rdmult, shift, dequant, scan, txb_costs, tcoeff,     \
1871                           qcoeff, dqcoeff, levels, iqmatrix);                  \
1872     }                                                                          \
1873     break;
1874   switch (tx_class) {
1875     UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D);
1876     UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ);
1877     UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT);
1878 #undef UPDATE_COEFF_SIMPLE_CASE
1879     default: assert(false);
1880   }
1881 
1882   // DC position
1883   if (si == 0) {
1884     // no need to update accu_dist because it's not used after this point
1885     int64_t dummy_dist = 0;
1886     update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class,
1887                          bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
1888                          dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
1889                          levels, iqmatrix);
1890   }
1891 
1892   const int tx_type_cost = get_tx_type_cost(x, xd, plane, tx_size, tx_type,
1893                                             cm->features.reduced_tx_set_used);
1894   if (eob == 0)
1895     accu_rate += skip_cost;
1896   else
1897     accu_rate += non_skip_cost + tx_type_cost;
1898 
1899   p->eobs[block] = eob;
1900   p->txb_entropy_ctx[block] =
1901       av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]);
1902 
1903   *rate_cost = accu_rate;
1904   return eob;
1905 }
1906 
1907 // This function is deprecated, but we keep it here because hash trellis
1908 // is not integrated with av1_optimize_txb_new yet
1909 int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
1910                      int blk_row, int blk_col, int block, TX_SIZE tx_size,
1911                      TXB_CTX *txb_ctx, int fast_mode, int *rate_cost) {
1912   const AV1_COMMON *cm = &cpi->common;
1913   const int reduced_tx_set_used = cm->features.reduced_tx_set_used;
1914   MACROBLOCKD *const xd = &x->e_mbd;
1915   const PLANE_TYPE plane_type = get_plane_type(plane);
1916   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
1917   const TX_TYPE tx_type = av1_get_tx_type(xd, plane_type, blk_row, blk_col,
1918                                           tx_size, reduced_tx_set_used);
1919   const MB_MODE_INFO *mbmi = xd->mi[0];
1920   const struct macroblock_plane *p = &x->plane[plane];
1921   struct macroblockd_plane *pd = &xd->plane[plane];
1922   const int eob = p->eobs[block];
1923   const int block_offset = BLOCK_OFFSET(block);
1924   tran_low_t *qcoeff = p->qcoeff + block_offset;
1925   tran_low_t *dqcoeff = pd->dqcoeff + block_offset;
1926   const tran_low_t *tcoeff = p->coeff + block_offset;
1927   const int16_t *dequant = p->dequant_QTX;
1928   const int seg_eob = av1_get_max_eob(tx_size);
1929   const int bwl = get_txb_bwl(tx_size);
1930   const int width = get_txb_wide(tx_size);
1931   const int height = get_txb_high(tx_size);
1932   const int is_inter = is_inter_block(mbmi);
1933   const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
1934   const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
1935   const int eob_multi_size = txsize_log2_minus4[tx_size];
1936   const LV_MAP_EOB_COST txb_eob_costs =
1937       x->eob_costs[eob_multi_size][plane_type];
1938 
1939   const int shift = av1_get_tx_scale(tx_size);
1940   const int64_t rdmult =
1941       (((int64_t)x->rdmult * plane_rd_mult[is_inter][plane_type]
1942         << (2 * (xd->bd - 8))) +
1943        2) >>
1944       2;
1945   uint8_t levels_buf[TX_PAD_2D];
1946   uint8_t *const levels = set_levels(levels_buf, width);
1947   const qm_val_t *iqmatrix =
1948       av1_get_iqmatrix(&cpi->common.quant_params, xd, plane, tx_size, tx_type);
1949   assert(width == (1 << bwl));
1950   const int tx_type_cost =
1951       get_tx_type_cost(x, xd, plane, tx_size, tx_type, reduced_tx_set_used);
1952   TxbInfo txb_info = {
1953     qcoeff,     levels,  dqcoeff, tcoeff,   dequant,      shift, tx_size,
1954     txs_ctx,    tx_type, bwl,     width,    height,       eob,   seg_eob,
1955     scan_order, txb_ctx, rdmult,  iqmatrix, tx_type_cost,
1956   };
1957 
1958 #if CONFIG_HTB_TRELLIS
1959   // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls
1960   // by storing the coefficient deltas in a hash table.
1961   // Currently disabled in speedfeatures.c
1962   if (eob <= HBT_EOB && eob > 0 && cpi->sf.use_hash_based_trellis) {
1963     return hbt_create_hashes(&txb_info, txb_costs, &txb_eob_costs, p, block,
1964                              fast_mode, rate_cost);
1965   }
1966 #else
1967   (void)fast_mode;
1968 #endif  // CONFIG_HTB_TRELLIS
1969   av1_txb_init_levels(qcoeff, width, height, levels);
1970 
1971   const int update =
1972       optimize_txb(&txb_info, txb_costs, &txb_eob_costs, rate_cost);
1973 
1974   if (update) {
1975     p->eobs[block] = txb_info.eob;
1976     p->txb_entropy_ctx[block] =
1977         av1_get_txb_entropy_context(qcoeff, scan_order, txb_info.eob);
1978   }
1979   return txb_info.eob;
1980 }
1981 
1982 int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
1983                                 const SCAN_ORDER *scan_order, int eob) {
1984   const int16_t *const scan = scan_order->scan;
1985   int cul_level = 0;
1986   int c;
1987 
1988   if (eob == 0) return 0;
1989   for (c = 0; c < eob; ++c) {
1990     cul_level += abs(qcoeff[scan[c]]);
1991     if (cul_level > COEFF_CONTEXT_MASK) break;
1992   }
1993 
1994   cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
1995   set_dc_sign(&cul_level, qcoeff[0]);
1996 
1997   return cul_level;
1998 }
1999 
2000 static void update_tx_type_count(const AV1_COMP *cpi, const AV1_COMMON *cm,
2001                                  MACROBLOCKD *xd, int blk_row, int blk_col,
2002                                  int plane, TX_SIZE tx_size,
2003                                  FRAME_COUNTS *counts,
2004                                  uint8_t allow_update_cdf) {
2005   MB_MODE_INFO *mbmi = xd->mi[0];
2006   int is_inter = is_inter_block(mbmi);
2007   const int reduced_tx_set_used = cm->features.reduced_tx_set_used;
2008   FRAME_CONTEXT *fc = xd->tile_ctx;
2009 #if !CONFIG_ENTROPY_STATS
2010   (void)counts;
2011 #endif  // !CONFIG_ENTROPY_STATS
2012 
2013   // Only y plane's tx_type is updated
2014   if (plane > 0) return;
2015   const TX_TYPE tx_type = av1_get_tx_type(xd, PLANE_TYPE_Y, blk_row, blk_col,
2016                                           tx_size, reduced_tx_set_used);
2017   if (is_inter) {
2018     if (cpi->oxcf.use_inter_dct_only) {
2019       assert(tx_type == DCT_DCT);
2020     }
2021   } else {
2022     if (cpi->oxcf.use_intra_dct_only) {
2023       assert(tx_type == DCT_DCT);
2024     } else if (cpi->oxcf.use_intra_default_tx_only) {
2025       const TX_TYPE default_type = get_default_tx_type(
2026           PLANE_TYPE_Y, xd, tx_size, cpi->is_screen_content_type);
2027       (void)default_type;
2028       assert(tx_type == default_type);
2029     }
2030   }
2031 
2032   if (get_ext_tx_types(tx_size, is_inter, reduced_tx_set_used) > 1 &&
2033       cm->quant_params.base_qindex > 0 && !mbmi->skip &&
2034       !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
2035     const int eset = get_ext_tx_set(tx_size, is_inter, reduced_tx_set_used);
2036     if (eset > 0) {
2037       const TxSetType tx_set_type =
2038           av1_get_ext_tx_set_type(tx_size, is_inter, reduced_tx_set_used);
2039       if (is_inter) {
2040         if (allow_update_cdf) {
2041           update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]],
2042                      av1_ext_tx_ind[tx_set_type][tx_type],
2043                      av1_num_ext_tx_set[tx_set_type]);
2044         }
2045 #if CONFIG_ENTROPY_STATS
2046         ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]]
2047                               [av1_ext_tx_ind[tx_set_type][tx_type]];
2048 #endif  // CONFIG_ENTROPY_STATS
2049       } else {
2050         PREDICTION_MODE intra_dir;
2051         if (mbmi->filter_intra_mode_info.use_filter_intra)
2052           intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
2053                                              .filter_intra_mode];
2054         else
2055           intra_dir = mbmi->mode;
2056 #if CONFIG_ENTROPY_STATS
2057         ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir]
2058                               [av1_ext_tx_ind[tx_set_type][tx_type]];
2059 #endif  // CONFIG_ENTROPY_STATS
2060         if (allow_update_cdf) {
2061           update_cdf(
2062               fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir],
2063               av1_ext_tx_ind[tx_set_type][tx_type],
2064               av1_num_ext_tx_set[tx_set_type]);
2065         }
2066       }
2067     }
2068   }
2069 }
2070 
2071 void av1_update_and_record_txb_context(int plane, int block, int blk_row,
2072                                        int blk_col, BLOCK_SIZE plane_bsize,
2073                                        TX_SIZE tx_size, void *arg) {
2074   struct tokenize_b_args *const args = arg;
2075   const AV1_COMP *cpi = args->cpi;
2076   const AV1_COMMON *cm = &cpi->common;
2077   ThreadData *const td = args->td;
2078   MACROBLOCK *const x = &td->mb;
2079   MACROBLOCKD *const xd = &x->e_mbd;
2080   struct macroblock_plane *p = &x->plane[plane];
2081   struct macroblockd_plane *pd = &xd->plane[plane];
2082   const int eob = p->eobs[block];
2083   const int block_offset = BLOCK_OFFSET(block);
2084   tran_low_t *qcoeff = p->qcoeff + block_offset;
2085   const PLANE_TYPE plane_type = pd->plane_type;
2086   const TX_TYPE tx_type =
2087       av1_get_tx_type(xd, plane_type, blk_row, blk_col, tx_size,
2088                       cm->features.reduced_tx_set_used);
2089   const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
2090   tran_low_t *tcoeff;
2091   assert(args->dry_run != DRY_RUN_COSTCOEFFS);
2092   if (args->dry_run == OUTPUT_ENABLED) {
2093     MB_MODE_INFO *mbmi = xd->mi[0];
2094     TXB_CTX txb_ctx;
2095     get_txb_ctx(plane_bsize, tx_size, plane,
2096                 pd->above_entropy_context + blk_col,
2097                 pd->left_entropy_context + blk_row, &txb_ctx);
2098     const int bwl = get_txb_bwl(tx_size);
2099     const int width = get_txb_wide(tx_size);
2100     const int height = get_txb_high(tx_size);
2101     const uint8_t allow_update_cdf = args->allow_update_cdf;
2102     const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
2103     FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
2104 #if CONFIG_ENTROPY_STATS
2105     int cdf_idx = cm->coef_cdf_category;
2106     ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
2107 #endif  // CONFIG_ENTROPY_STATS
2108     if (allow_update_cdf) {
2109       update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx],
2110                  eob == 0, 2);
2111     }
2112 
2113     CB_COEFF_BUFFER *cb_coef_buff = x->cb_coef_buff;
2114     const int txb_offset =
2115         x->mbmi_ext_frame->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
2116     uint16_t *eob_txb = cb_coef_buff->eobs[plane] + txb_offset;
2117     uint8_t *const entropy_ctx = cb_coef_buff->entropy_ctx[plane] + txb_offset;
2118     entropy_ctx[block] = txb_ctx.txb_skip_ctx;
2119     eob_txb[block] = eob;
2120 
2121     if (eob == 0) {
2122       av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col,
2123                                blk_row);
2124       return;
2125     }
2126     const int segment_id = mbmi->segment_id;
2127     const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
2128     tran_low_t *tcoeff_txb =
2129         cb_coef_buff->tcoeff[plane] + x->mbmi_ext_frame->cb_offset;
2130     tcoeff = tcoeff_txb + block_offset;
2131     memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
2132 
2133     uint8_t levels_buf[TX_PAD_2D];
2134     uint8_t *const levels = set_levels(levels_buf, width);
2135     av1_txb_init_levels(tcoeff, width, height, levels);
2136     update_tx_type_count(cpi, cm, xd, blk_row, blk_col, plane, tx_size,
2137                          td->counts, allow_update_cdf);
2138 
2139     const TX_CLASS tx_class = tx_type_to_class[tx_type];
2140     const int16_t *const scan = scan_order->scan;
2141 
2142     // record tx type usage
2143     td->rd_counts.tx_type_used[tx_size][tx_type]++;
2144 
2145 #if CONFIG_ENTROPY_STATS
2146     av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
2147                            td->counts, allow_update_cdf);
2148 #else
2149     av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx,
2150                            allow_update_cdf);
2151 #endif
2152 
2153     DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
2154     av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class,
2155                             coeff_contexts);
2156 
2157     for (int c = eob - 1; c >= 0; --c) {
2158       const int pos = scan[c];
2159       const int coeff_ctx = coeff_contexts[pos];
2160       const tran_low_t v = qcoeff[pos];
2161       const tran_low_t level = abs(v);
2162 
2163       if (allow_update_cdf) {
2164         if (c == eob - 1) {
2165           assert(coeff_ctx < 4);
2166           update_cdf(
2167               ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx],
2168               AOMMIN(level, 3) - 1, 3);
2169         } else {
2170           update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx],
2171                      AOMMIN(level, 3), 4);
2172         }
2173       }
2174       if (c == eob - 1) {
2175         assert(coeff_ctx < 4);
2176 #if CONFIG_ENTROPY_STATS
2177         ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type]
2178                                           [coeff_ctx][AOMMIN(level, 3) - 1];
2179       } else {
2180         ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type]
2181                                       [coeff_ctx][AOMMIN(level, 3)];
2182 #endif
2183       }
2184       if (level > NUM_BASE_LEVELS) {
2185         const int base_range = level - 1 - NUM_BASE_LEVELS;
2186         const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
2187         for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
2188           const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
2189           if (allow_update_cdf) {
2190             update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)]
2191                                            [plane_type][br_ctx],
2192                        k, BR_CDF_SIZE);
2193           }
2194           for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
2195 #if CONFIG_ENTROPY_STATS
2196             ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type]
2197                                    [lps][br_ctx][lps == k];
2198 #endif  // CONFIG_ENTROPY_STATS
2199             if (lps == k) break;
2200           }
2201 #if CONFIG_ENTROPY_STATS
2202           ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
2203                                        [plane_type][br_ctx][k];
2204 #endif
2205           if (k < BR_CDF_SIZE - 1) break;
2206         }
2207       }
2208     }
2209     // Update the context needed to code the DC sign (if applicable)
2210     if (tcoeff[0] != 0) {
2211       const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
2212       const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
2213 #if CONFIG_ENTROPY_STATS
2214       ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
2215 #endif  // CONFIG_ENTROPY_STATS
2216       if (allow_update_cdf)
2217         update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2);
2218       entropy_ctx[block] |= dc_sign_ctx << DC_SIGN_CTX_SHIFT;
2219     }
2220   } else {
2221     tcoeff = qcoeff;
2222   }
2223   const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob);
2224   av1_set_entropy_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level,
2225                            blk_col, blk_row);
2226 }
2227 
2228 void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
2229                             RUN_TYPE dry_run, BLOCK_SIZE bsize,
2230                             uint8_t allow_update_cdf) {
2231   const AV1_COMMON *const cm = &cpi->common;
2232   const int num_planes = av1_num_planes(cm);
2233   MACROBLOCK *const x = &td->mb;
2234   MACROBLOCKD *const xd = &x->e_mbd;
2235   MB_MODE_INFO *const mbmi = xd->mi[0];
2236   struct tokenize_b_args arg = { cpi, td, 0, allow_update_cdf, dry_run };
2237   if (mbmi->skip) {
2238     av1_reset_entropy_context(xd, bsize, num_planes);
2239     return;
2240   }
2241 
2242   for (int plane = 0; plane < num_planes; ++plane) {
2243     if (plane && !xd->is_chroma_ref) break;
2244     const struct macroblockd_plane *const pd = &xd->plane[plane];
2245     const int ss_x = pd->subsampling_x;
2246     const int ss_y = pd->subsampling_y;
2247     const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ss_x, ss_y);
2248     av1_foreach_transformed_block_in_plane(
2249         xd, plane_bsize, plane, av1_update_and_record_txb_context, &arg);
2250   }
2251 }
2252 
2253 CB_COEFF_BUFFER *av1_get_cb_coeff_buffer(const struct AV1_COMP *cpi, int mi_row,
2254                                          int mi_col) {
2255   const AV1_COMMON *const cm = &cpi->common;
2256   const int mib_size_log2 = cm->seq_params.mib_size_log2;
2257   const int stride = (cm->mi_params.mi_cols >> mib_size_log2) + 1;
2258   const int offset =
2259       (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
2260   return cpi->coeff_buffer_base + offset;
2261 }
2262