1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_dsp_rtcd.h"
12 
13 #include "vpx_config.h"
14 #include "vp8_rtcd.h"
15 #include "encodemb.h"
16 #include "vp8/common/reconinter.h"
17 #include "vp8/encoder/quantize.h"
18 #include "tokenize.h"
19 #include "vp8/common/invtrans.h"
20 #include "vpx_mem/vpx_mem.h"
21 #include "rdopt.h"
22 
vp8_subtract_b(BLOCK * be,BLOCKD * bd,int pitch)23 void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) {
24   unsigned char *src_ptr = (*(be->base_src) + be->src);
25   short *diff_ptr = be->src_diff;
26   unsigned char *pred_ptr = bd->predictor;
27   int src_stride = be->src_stride;
28 
29   vpx_subtract_block(4, 4, diff_ptr, pitch, src_ptr, src_stride, pred_ptr,
30                      pitch);
31 }
32 
vp8_subtract_mbuv(short * diff,unsigned char * usrc,unsigned char * vsrc,int src_stride,unsigned char * upred,unsigned char * vpred,int pred_stride)33 void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
34                        int src_stride, unsigned char *upred,
35                        unsigned char *vpred, int pred_stride) {
36   short *udiff = diff + 256;
37   short *vdiff = diff + 320;
38 
39   vpx_subtract_block(8, 8, udiff, 8, usrc, src_stride, upred, pred_stride);
40   vpx_subtract_block(8, 8, vdiff, 8, vsrc, src_stride, vpred, pred_stride);
41 }
42 
vp8_subtract_mby(short * diff,unsigned char * src,int src_stride,unsigned char * pred,int pred_stride)43 void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
44                       unsigned char *pred, int pred_stride) {
45   vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride);
46 }
47 
vp8_subtract_mb(MACROBLOCK * x)48 static void vp8_subtract_mb(MACROBLOCK *x) {
49   BLOCK *b = &x->block[0];
50 
51   vp8_subtract_mby(x->src_diff, *(b->base_src), b->src_stride,
52                    x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
53   vp8_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
54                     x->src.uv_stride, x->e_mbd.dst.u_buffer,
55                     x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);
56 }
57 
build_dcblock(MACROBLOCK * x)58 static void build_dcblock(MACROBLOCK *x) {
59   short *src_diff_ptr = &x->src_diff[384];
60   int i;
61 
62   for (i = 0; i < 16; ++i) {
63     src_diff_ptr[i] = x->coeff[i * 16];
64   }
65 }
66 
vp8_transform_mbuv(MACROBLOCK * x)67 void vp8_transform_mbuv(MACROBLOCK *x) {
68   int i;
69 
70   for (i = 16; i < 24; i += 2) {
71     x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
72   }
73 }
74 
vp8_transform_intra_mby(MACROBLOCK * x)75 void vp8_transform_intra_mby(MACROBLOCK *x) {
76   int i;
77 
78   for (i = 0; i < 16; i += 2) {
79     x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
80   }
81 
82   /* build dc block from 16 y dc values */
83   build_dcblock(x);
84 
85   /* do 2nd order transform on the dc block */
86   x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
87 }
88 
transform_mb(MACROBLOCK * x)89 static void transform_mb(MACROBLOCK *x) {
90   int i;
91 
92   for (i = 0; i < 16; i += 2) {
93     x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
94   }
95 
96   /* build dc block from 16 y dc values */
97   if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) build_dcblock(x);
98 
99   for (i = 16; i < 24; i += 2) {
100     x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
101   }
102 
103   /* do 2nd order transform on the dc block */
104   if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
105     x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
106   }
107 }
108 
transform_mby(MACROBLOCK * x)109 static void transform_mby(MACROBLOCK *x) {
110   int i;
111 
112   for (i = 0; i < 16; i += 2) {
113     x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
114   }
115 
116   /* build dc block from 16 y dc values */
117   if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
118     build_dcblock(x);
119     x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
120   }
121 }
122 
123 #define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
124 
125 typedef struct vp8_token_state vp8_token_state;
126 
127 struct vp8_token_state {
128   int rate;
129   int error;
130   signed char next;
131   signed char token;
132   short qc;
133 };
134 
135 /* TODO: experiments to find optimal multiple numbers */
136 #define Y1_RD_MULT 4
137 #define UV_RD_MULT 2
138 #define Y2_RD_MULT 16
139 
140 static const int plane_rd_mult[4] = { Y1_RD_MULT, Y2_RD_MULT, UV_RD_MULT,
141                                       Y1_RD_MULT };
142 
optimize_b(MACROBLOCK * mb,int ib,int type,ENTROPY_CONTEXT * a,ENTROPY_CONTEXT * l)143 static void optimize_b(MACROBLOCK *mb, int ib, int type, ENTROPY_CONTEXT *a,
144                        ENTROPY_CONTEXT *l) {
145   BLOCK *b;
146   BLOCKD *d;
147   vp8_token_state tokens[17][2];
148   unsigned best_mask[2];
149   const short *dequant_ptr;
150   const short *coeff_ptr;
151   short *qcoeff_ptr;
152   short *dqcoeff_ptr;
153   int eob;
154   int i0;
155   int rc;
156   int x;
157   int sz = 0;
158   int next;
159   int rdmult;
160   int rddiv;
161   int final_eob;
162   int rd_cost0;
163   int rd_cost1;
164   int rate0;
165   int rate1;
166   int error0;
167   int error1;
168   int t0;
169   int t1;
170   int best;
171   int band;
172   int pt;
173   int i;
174   int err_mult = plane_rd_mult[type];
175 
176   b = &mb->block[ib];
177   d = &mb->e_mbd.block[ib];
178 
179   dequant_ptr = d->dequant;
180   coeff_ptr = b->coeff;
181   qcoeff_ptr = d->qcoeff;
182   dqcoeff_ptr = d->dqcoeff;
183   i0 = !type;
184   eob = *d->eob;
185 
186   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
187   rdmult = mb->rdmult * err_mult;
188   if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
189     rdmult = (rdmult * 9) >> 4;
190   }
191 
192   rddiv = mb->rddiv;
193   best_mask[0] = best_mask[1] = 0;
194   /* Initialize the sentinel node of the trellis. */
195   tokens[eob][0].rate = 0;
196   tokens[eob][0].error = 0;
197   tokens[eob][0].next = 16;
198   tokens[eob][0].token = DCT_EOB_TOKEN;
199   tokens[eob][0].qc = 0;
200   *(tokens[eob] + 1) = *(tokens[eob] + 0);
201   next = eob;
202   for (i = eob; i-- > i0;) {
203     int base_bits;
204     int d2;
205     int dx;
206 
207     rc = vp8_default_zig_zag1d[i];
208     x = qcoeff_ptr[rc];
209     /* Only add a trellis state for non-zero coefficients. */
210     if (x) {
211       int shortcut = 0;
212       error0 = tokens[next][0].error;
213       error1 = tokens[next][1].error;
214       /* Evaluate the first possibility for this state. */
215       rate0 = tokens[next][0].rate;
216       rate1 = tokens[next][1].rate;
217       t0 = (vp8_dct_value_tokens_ptr + x)->Token;
218       /* Consider both possible successor states. */
219       if (next < 16) {
220         band = vp8_coef_bands[i + 1];
221         pt = vp8_prev_token_class[t0];
222         rate0 += mb->token_costs[type][band][pt][tokens[next][0].token];
223         rate1 += mb->token_costs[type][band][pt][tokens[next][1].token];
224       }
225       rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
226       rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
227       if (rd_cost0 == rd_cost1) {
228         rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
229         rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
230       }
231       /* And pick the best. */
232       best = rd_cost1 < rd_cost0;
233       base_bits = *(vp8_dct_value_cost_ptr + x);
234       dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
235       d2 = dx * dx;
236       tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
237       tokens[i][0].error = d2 + (best ? error1 : error0);
238       tokens[i][0].next = next;
239       tokens[i][0].token = t0;
240       tokens[i][0].qc = x;
241       best_mask[0] |= best << i;
242       /* Evaluate the second possibility for this state. */
243       rate0 = tokens[next][0].rate;
244       rate1 = tokens[next][1].rate;
245 
246       if ((abs(x) * dequant_ptr[rc] > abs(coeff_ptr[rc])) &&
247           (abs(x) * dequant_ptr[rc] < abs(coeff_ptr[rc]) + dequant_ptr[rc])) {
248         shortcut = 1;
249       } else {
250         shortcut = 0;
251       }
252 
253       if (shortcut) {
254         sz = -(x < 0);
255         x -= 2 * sz + 1;
256       }
257 
258       /* Consider both possible successor states. */
259       if (!x) {
260         /* If we reduced this coefficient to zero, check to see if
261          *  we need to move the EOB back here.
262          */
263         t0 =
264             tokens[next][0].token == DCT_EOB_TOKEN ? DCT_EOB_TOKEN : ZERO_TOKEN;
265         t1 =
266             tokens[next][1].token == DCT_EOB_TOKEN ? DCT_EOB_TOKEN : ZERO_TOKEN;
267       } else {
268         t0 = t1 = (vp8_dct_value_tokens_ptr + x)->Token;
269       }
270       if (next < 16) {
271         band = vp8_coef_bands[i + 1];
272         if (t0 != DCT_EOB_TOKEN) {
273           pt = vp8_prev_token_class[t0];
274           rate0 += mb->token_costs[type][band][pt][tokens[next][0].token];
275         }
276         if (t1 != DCT_EOB_TOKEN) {
277           pt = vp8_prev_token_class[t1];
278           rate1 += mb->token_costs[type][band][pt][tokens[next][1].token];
279         }
280       }
281 
282       rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
283       rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
284       if (rd_cost0 == rd_cost1) {
285         rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
286         rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
287       }
288       /* And pick the best. */
289       best = rd_cost1 < rd_cost0;
290       base_bits = *(vp8_dct_value_cost_ptr + x);
291 
292       if (shortcut) {
293         dx -= (dequant_ptr[rc] + sz) ^ sz;
294         d2 = dx * dx;
295       }
296       tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
297       tokens[i][1].error = d2 + (best ? error1 : error0);
298       tokens[i][1].next = next;
299       tokens[i][1].token = best ? t1 : t0;
300       tokens[i][1].qc = x;
301       best_mask[1] |= best << i;
302       /* Finally, make this the new head of the trellis. */
303       next = i;
304     }
305     /* There's no choice to make for a zero coefficient, so we don't
306      *  add a new trellis node, but we do need to update the costs.
307      */
308     else {
309       band = vp8_coef_bands[i + 1];
310       t0 = tokens[next][0].token;
311       t1 = tokens[next][1].token;
312       /* Update the cost of each path if we're past the EOB token. */
313       if (t0 != DCT_EOB_TOKEN) {
314         tokens[next][0].rate += mb->token_costs[type][band][0][t0];
315         tokens[next][0].token = ZERO_TOKEN;
316       }
317       if (t1 != DCT_EOB_TOKEN) {
318         tokens[next][1].rate += mb->token_costs[type][band][0][t1];
319         tokens[next][1].token = ZERO_TOKEN;
320       }
321       /* Don't update next, because we didn't add a new node. */
322     }
323   }
324 
325   /* Now pick the best path through the whole trellis. */
326   band = vp8_coef_bands[i + 1];
327   VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
328   rate0 = tokens[next][0].rate;
329   rate1 = tokens[next][1].rate;
330   error0 = tokens[next][0].error;
331   error1 = tokens[next][1].error;
332   t0 = tokens[next][0].token;
333   t1 = tokens[next][1].token;
334   rate0 += mb->token_costs[type][band][pt][t0];
335   rate1 += mb->token_costs[type][band][pt][t1];
336   rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
337   rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
338   if (rd_cost0 == rd_cost1) {
339     rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
340     rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
341   }
342   best = rd_cost1 < rd_cost0;
343   final_eob = i0 - 1;
344   for (i = next; i < eob; i = next) {
345     x = tokens[i][best].qc;
346     if (x) final_eob = i;
347     rc = vp8_default_zig_zag1d[i];
348     qcoeff_ptr[rc] = x;
349     dqcoeff_ptr[rc] = x * dequant_ptr[rc];
350     next = tokens[i][best].next;
351     best = (best_mask[best] >> i) & 1;
352   }
353   final_eob++;
354 
355   *a = *l = (final_eob != !type);
356   *d->eob = (char)final_eob;
357 }
check_reset_2nd_coeffs(MACROBLOCKD * x,int type,ENTROPY_CONTEXT * a,ENTROPY_CONTEXT * l)358 static void check_reset_2nd_coeffs(MACROBLOCKD *x, int type, ENTROPY_CONTEXT *a,
359                                    ENTROPY_CONTEXT *l) {
360   int sum = 0;
361   int i;
362   BLOCKD *bd = &x->block[24];
363 
364   if (bd->dequant[0] >= 35 && bd->dequant[1] >= 35) return;
365 
366   for (i = 0; i < (*bd->eob); ++i) {
367     int coef = bd->dqcoeff[vp8_default_zig_zag1d[i]];
368     sum += (coef >= 0) ? coef : -coef;
369     if (sum >= 35) return;
370   }
371   /**************************************************************************
372   our inverse hadamard transform effectively is weighted sum of all 16 inputs
373   with weight either 1 or -1. It has a last stage scaling of (sum+3)>>3. And
374   dc only idct is (dc+4)>>3. So if all the sums are between -35 and 29, the
375   output after inverse wht and idct will be all zero. A sum of absolute value
376   smaller than 35 guarantees all 16 different (+1/-1) weighted sums in wht
377   fall between -35 and +35.
378   **************************************************************************/
379   if (sum < 35) {
380     for (i = 0; i < (*bd->eob); ++i) {
381       int rc = vp8_default_zig_zag1d[i];
382       bd->qcoeff[rc] = 0;
383       bd->dqcoeff[rc] = 0;
384     }
385     *bd->eob = 0;
386     *a = *l = (*bd->eob != !type);
387   }
388 }
389 
optimize_mb(MACROBLOCK * x)390 static void optimize_mb(MACROBLOCK *x) {
391   int b;
392   int type;
393   int has_2nd_order;
394 
395   ENTROPY_CONTEXT_PLANES t_above, t_left;
396   ENTROPY_CONTEXT *ta;
397   ENTROPY_CONTEXT *tl;
398 
399   memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
400   memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
401 
402   ta = (ENTROPY_CONTEXT *)&t_above;
403   tl = (ENTROPY_CONTEXT *)&t_left;
404 
405   has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED &&
406                    x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
407   type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
408 
409   for (b = 0; b < 16; ++b) {
410     optimize_b(x, b, type, ta + vp8_block2above[b], tl + vp8_block2left[b]);
411   }
412 
413   for (b = 16; b < 24; ++b) {
414     optimize_b(x, b, PLANE_TYPE_UV, ta + vp8_block2above[b],
415                tl + vp8_block2left[b]);
416   }
417 
418   if (has_2nd_order) {
419     b = 24;
420     optimize_b(x, b, PLANE_TYPE_Y2, ta + vp8_block2above[b],
421                tl + vp8_block2left[b]);
422     check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2, ta + vp8_block2above[b],
423                            tl + vp8_block2left[b]);
424   }
425 }
426 
vp8_optimize_mby(MACROBLOCK * x)427 void vp8_optimize_mby(MACROBLOCK *x) {
428   int b;
429   int type;
430   int has_2nd_order;
431 
432   ENTROPY_CONTEXT_PLANES t_above, t_left;
433   ENTROPY_CONTEXT *ta;
434   ENTROPY_CONTEXT *tl;
435 
436   if (!x->e_mbd.above_context) return;
437 
438   if (!x->e_mbd.left_context) return;
439 
440   memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
441   memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
442 
443   ta = (ENTROPY_CONTEXT *)&t_above;
444   tl = (ENTROPY_CONTEXT *)&t_left;
445 
446   has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED &&
447                    x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
448   type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;
449 
450   for (b = 0; b < 16; ++b) {
451     optimize_b(x, b, type, ta + vp8_block2above[b], tl + vp8_block2left[b]);
452   }
453 
454   if (has_2nd_order) {
455     b = 24;
456     optimize_b(x, b, PLANE_TYPE_Y2, ta + vp8_block2above[b],
457                tl + vp8_block2left[b]);
458     check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2, ta + vp8_block2above[b],
459                            tl + vp8_block2left[b]);
460   }
461 }
462 
vp8_optimize_mbuv(MACROBLOCK * x)463 void vp8_optimize_mbuv(MACROBLOCK *x) {
464   int b;
465   ENTROPY_CONTEXT_PLANES t_above, t_left;
466   ENTROPY_CONTEXT *ta;
467   ENTROPY_CONTEXT *tl;
468 
469   if (!x->e_mbd.above_context) return;
470 
471   if (!x->e_mbd.left_context) return;
472 
473   memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
474   memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
475 
476   ta = (ENTROPY_CONTEXT *)&t_above;
477   tl = (ENTROPY_CONTEXT *)&t_left;
478 
479   for (b = 16; b < 24; ++b) {
480     optimize_b(x, b, PLANE_TYPE_UV, ta + vp8_block2above[b],
481                tl + vp8_block2left[b]);
482   }
483 }
484 
vp8_encode_inter16x16(MACROBLOCK * x)485 void vp8_encode_inter16x16(MACROBLOCK *x) {
486   vp8_build_inter_predictors_mb(&x->e_mbd);
487 
488   vp8_subtract_mb(x);
489 
490   transform_mb(x);
491 
492   vp8_quantize_mb(x);
493 
494   if (x->optimize) optimize_mb(x);
495 }
496 
497 /* this funciton is used by first pass only */
vp8_encode_inter16x16y(MACROBLOCK * x)498 void vp8_encode_inter16x16y(MACROBLOCK *x) {
499   BLOCK *b = &x->block[0];
500 
501   vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
502                                       x->e_mbd.dst.y_stride);
503 
504   vp8_subtract_mby(x->src_diff, *(b->base_src), b->src_stride,
505                    x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);
506 
507   transform_mby(x);
508 
509   vp8_quantize_mby(x);
510 
511   vp8_inverse_transform_mby(&x->e_mbd);
512 }
513