1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 //   Quantization
11 //
12 // Author: Skal (pascal.massimino@gmail.com)
13 
14 #include <assert.h>
15 #include <math.h>
16 #include <stdlib.h>  // for abs()
17 
18 #include "./vp8i_enc.h"
19 #include "./cost_enc.h"
20 
21 #define DO_TRELLIS_I4  1
22 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
23 #define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
24 #define USE_TDISTO 1
25 
26 #define MID_ALPHA 64      // neutral value for susceptibility
27 #define MIN_ALPHA 30      // lowest usable value for susceptibility
28 #define MAX_ALPHA 100     // higher meaningful value for susceptibility
29 
30 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
31                           // power-law modulation. Must be strictly less than 1.
32 
33 // number of non-zero coeffs below which we consider the block very flat
34 // (and apply a penalty to complex predictions)
35 #define FLATNESS_LIMIT_I16 10      // I16 mode
36 #define FLATNESS_LIMIT_I4  3       // I4 mode
37 #define FLATNESS_LIMIT_UV  2       // UV mode
38 #define FLATNESS_PENALTY   140     // roughly ~1bit per block
39 
40 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
41 
42 #define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
43 
44 // #define DEBUG_BLOCK
45 
46 //------------------------------------------------------------------------------
47 
48 #if defined(DEBUG_BLOCK)
49 
50 #include <stdio.h>
51 #include <stdlib.h>
52 
PrintBlockInfo(const VP8EncIterator * const it,const VP8ModeScore * const rd)53 static void PrintBlockInfo(const VP8EncIterator* const it,
54                            const VP8ModeScore* const rd) {
55   int i, j;
56   const int is_i16 = (it->mb_->type_ == 1);
57   const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
58   const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
59   const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
60   const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
61   printf("SOURCE / OUTPUT / ABS DELTA\n");
62   for (j = 0; j < 16; ++j) {
63     for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
64     printf("     ");
65     for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
66     printf("     ");
67     for (i = 0; i < 16; ++i) {
68       printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
69     }
70     printf("\n");
71   }
72   printf("\n");   // newline before the U/V block
73   for (j = 0; j < 8; ++j) {
74     for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
75     printf(" ");
76     for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
77     printf("    ");
78     for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
79     printf(" ");
80     for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
81     printf("   ");
82     for (i = 0; i < 8; ++i) {
83       printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
84     }
85     printf(" ");
86     for (i = 8; i < 16; ++i) {
87       printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
88     }
89     printf("\n");
90   }
91   printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
92     (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
93     (int)rd->score);
94   if (is_i16) {
95     printf("Mode: %d\n", rd->mode_i16);
96     printf("y_dc_levels:");
97     for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
98     printf("\n");
99   } else {
100     printf("Modes[16]: ");
101     for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
102     printf("\n");
103   }
104   printf("y_ac_levels:\n");
105   for (j = 0; j < 16; ++j) {
106     for (i = is_i16 ? 1 : 0; i < 16; ++i) {
107       printf("%4d ", rd->y_ac_levels[j][i]);
108     }
109     printf("\n");
110   }
111   printf("\n");
112   printf("uv_levels (mode=%d):\n", rd->mode_uv);
113   for (j = 0; j < 8; ++j) {
114     for (i = 0; i < 16; ++i) {
115       printf("%4d ", rd->uv_levels[j][i]);
116     }
117     printf("\n");
118   }
119 }
120 
121 #endif   // DEBUG_BLOCK
122 
123 //------------------------------------------------------------------------------
124 
clip(int v,int m,int M)125 static WEBP_INLINE int clip(int v, int m, int M) {
126   return v < m ? m : v > M ? M : v;
127 }
128 
129 static const uint8_t kZigzag[16] = {
130   0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
131 };
132 
133 static const uint8_t kDcTable[128] = {
134   4,     5,   6,   7,   8,   9,  10,  10,
135   11,   12,  13,  14,  15,  16,  17,  17,
136   18,   19,  20,  20,  21,  21,  22,  22,
137   23,   23,  24,  25,  25,  26,  27,  28,
138   29,   30,  31,  32,  33,  34,  35,  36,
139   37,   37,  38,  39,  40,  41,  42,  43,
140   44,   45,  46,  46,  47,  48,  49,  50,
141   51,   52,  53,  54,  55,  56,  57,  58,
142   59,   60,  61,  62,  63,  64,  65,  66,
143   67,   68,  69,  70,  71,  72,  73,  74,
144   75,   76,  76,  77,  78,  79,  80,  81,
145   82,   83,  84,  85,  86,  87,  88,  89,
146   91,   93,  95,  96,  98, 100, 101, 102,
147   104, 106, 108, 110, 112, 114, 116, 118,
148   122, 124, 126, 128, 130, 132, 134, 136,
149   138, 140, 143, 145, 148, 151, 154, 157
150 };
151 
152 static const uint16_t kAcTable[128] = {
153   4,     5,   6,   7,   8,   9,  10,  11,
154   12,   13,  14,  15,  16,  17,  18,  19,
155   20,   21,  22,  23,  24,  25,  26,  27,
156   28,   29,  30,  31,  32,  33,  34,  35,
157   36,   37,  38,  39,  40,  41,  42,  43,
158   44,   45,  46,  47,  48,  49,  50,  51,
159   52,   53,  54,  55,  56,  57,  58,  60,
160   62,   64,  66,  68,  70,  72,  74,  76,
161   78,   80,  82,  84,  86,  88,  90,  92,
162   94,   96,  98, 100, 102, 104, 106, 108,
163   110, 112, 114, 116, 119, 122, 125, 128,
164   131, 134, 137, 140, 143, 146, 149, 152,
165   155, 158, 161, 164, 167, 170, 173, 177,
166   181, 185, 189, 193, 197, 201, 205, 209,
167   213, 217, 221, 225, 229, 234, 239, 245,
168   249, 254, 259, 264, 269, 274, 279, 284
169 };
170 
171 static const uint16_t kAcTable2[128] = {
172   8,     8,   9,  10,  12,  13,  15,  17,
173   18,   20,  21,  23,  24,  26,  27,  29,
174   31,   32,  34,  35,  37,  38,  40,  41,
175   43,   44,  46,  48,  49,  51,  52,  54,
176   55,   57,  58,  60,  62,  63,  65,  66,
177   68,   69,  71,  72,  74,  75,  77,  79,
178   80,   82,  83,  85,  86,  88,  89,  93,
179   96,   99, 102, 105, 108, 111, 114, 117,
180   120, 124, 127, 130, 133, 136, 139, 142,
181   145, 148, 151, 155, 158, 161, 164, 167,
182   170, 173, 176, 179, 184, 189, 193, 198,
183   203, 207, 212, 217, 221, 226, 230, 235,
184   240, 244, 249, 254, 258, 263, 268, 274,
185   280, 286, 292, 299, 305, 311, 317, 323,
186   330, 336, 342, 348, 354, 362, 370, 379,
187   385, 393, 401, 409, 416, 424, 432, 440
188 };
189 
190 static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
191   { 96, 110 }, { 96, 108 }, { 110, 115 }
192 };
193 
194 // Sharpening by (slightly) raising the hi-frequency coeffs.
195 // Hack-ish but helpful for mid-bitrate range. Use with care.
196 #define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
197 static const uint8_t kFreqSharpening[16] = {
198   0,  30, 60, 90,
199   30, 60, 90, 90,
200   60, 90, 90, 90,
201   90, 90, 90, 90
202 };
203 
204 //------------------------------------------------------------------------------
205 // Initialize quantization parameters in VP8Matrix
206 
207 // Returns the average quantizer
ExpandMatrix(VP8Matrix * const m,int type)208 static int ExpandMatrix(VP8Matrix* const m, int type) {
209   int i, sum;
210   for (i = 0; i < 2; ++i) {
211     const int is_ac_coeff = (i > 0);
212     const int bias = kBiasMatrices[type][is_ac_coeff];
213     m->iq_[i] = (1 << QFIX) / m->q_[i];
214     m->bias_[i] = BIAS(bias);
215     // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
216     //   * zero if coeff <= zthresh
217     //   * non-zero if coeff > zthresh
218     m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
219   }
220   for (i = 2; i < 16; ++i) {
221     m->q_[i] = m->q_[1];
222     m->iq_[i] = m->iq_[1];
223     m->bias_[i] = m->bias_[1];
224     m->zthresh_[i] = m->zthresh_[1];
225   }
226   for (sum = 0, i = 0; i < 16; ++i) {
227     if (type == 0) {  // we only use sharpening for AC luma coeffs
228       m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
229     } else {
230       m->sharpen_[i] = 0;
231     }
232     sum += m->q_[i];
233   }
234   return (sum + 8) >> 4;
235 }
236 
CheckLambdaValue(int * const v)237 static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
238 
SetupMatrices(VP8Encoder * enc)239 static void SetupMatrices(VP8Encoder* enc) {
240   int i;
241   const int tlambda_scale =
242     (enc->method_ >= 4) ? enc->config_->sns_strength
243                         : 0;
244   const int num_segments = enc->segment_hdr_.num_segments_;
245   for (i = 0; i < num_segments; ++i) {
246     VP8SegmentInfo* const m = &enc->dqm_[i];
247     const int q = m->quant_;
248     int q_i4, q_i16, q_uv;
249     m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
250     m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];
251 
252     m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
253     m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
254 
255     m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
256     m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
257 
258     q_i4  = ExpandMatrix(&m->y1_, 0);
259     q_i16 = ExpandMatrix(&m->y2_, 1);
260     q_uv  = ExpandMatrix(&m->uv_, 2);
261 
262     m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
263     m->lambda_i16_         = (3 * q_i16 * q_i16);
264     m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
265     m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
266     m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
267     m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
268     m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
269     m->tlambda_            = (tlambda_scale * q_i4) >> 5;
270 
271     // none of these constants should be < 1
272     CheckLambdaValue(&m->lambda_i4_);
273     CheckLambdaValue(&m->lambda_i16_);
274     CheckLambdaValue(&m->lambda_uv_);
275     CheckLambdaValue(&m->lambda_mode_);
276     CheckLambdaValue(&m->lambda_trellis_i4_);
277     CheckLambdaValue(&m->lambda_trellis_i16_);
278     CheckLambdaValue(&m->lambda_trellis_uv_);
279     CheckLambdaValue(&m->tlambda_);
280 
281     m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
282     m->max_edge_  = 0;
283 
284     m->i4_penalty_ = 1000 * q_i4 * q_i4;
285   }
286 }
287 
288 //------------------------------------------------------------------------------
289 // Initialize filtering parameters
290 
291 // Very small filter-strength values have close to no visual effect. So we can
292 // save a little decoding-CPU by turning filtering off for these.
293 #define FSTRENGTH_CUTOFF 2
294 
SetupFilterStrength(VP8Encoder * const enc)295 static void SetupFilterStrength(VP8Encoder* const enc) {
296   int i;
297   // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
298   const int level0 = 5 * enc->config_->filter_strength;
299   for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
300     VP8SegmentInfo* const m = &enc->dqm_[i];
301     // We focus on the quantization of AC coeffs.
302     const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
303     const int base_strength =
304         VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
305     // Segments with lower complexity ('beta') will be less filtered.
306     const int f = base_strength * level0 / (256 + m->beta_);
307     m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
308   }
309   // We record the initial strength (mainly for the case of 1-segment only).
310   enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
311   enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
312   enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
313 }
314 
315 //------------------------------------------------------------------------------
316 
317 // Note: if you change the values below, remember that the max range
318 // allowed by the syntax for DQ_UV is [-16,16].
319 #define MAX_DQ_UV (6)
320 #define MIN_DQ_UV (-4)
321 
322 // We want to emulate jpeg-like behaviour where the expected "good" quality
323 // is around q=75. Internally, our "good" middle is around c=50. So we
324 // map accordingly using linear piece-wise function
QualityToCompression(double c)325 static double QualityToCompression(double c) {
326   const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
327   // The file size roughly scales as pow(quantizer, 3.). Actually, the
328   // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
329   // in the mid-quant range. So we scale the compressibility inversely to
330   // this power-law: quant ~= compression ^ 1/3. This law holds well for
331   // low quant. Finer modeling for high-quant would make use of kAcTable[]
332   // more explicitly.
333   const double v = pow(linear_c, 1 / 3.);
334   return v;
335 }
336 
QualityToJPEGCompression(double c,double alpha)337 static double QualityToJPEGCompression(double c, double alpha) {
338   // We map the complexity 'alpha' and quality setting 'c' to a compression
339   // exponent empirically matched to the compression curve of libjpeg6b.
340   // On average, the WebP output size will be roughly similar to that of a
341   // JPEG file compressed with same quality factor.
342   const double amin = 0.30;
343   const double amax = 0.85;
344   const double exp_min = 0.4;
345   const double exp_max = 0.9;
346   const double slope = (exp_min - exp_max) / (amax - amin);
347   // Linearly interpolate 'expn' from exp_min to exp_max
348   // in the [amin, amax] range.
349   const double expn = (alpha > amax) ? exp_min
350                     : (alpha < amin) ? exp_max
351                     : exp_max + slope * (alpha - amin);
352   const double v = pow(c, expn);
353   return v;
354 }
355 
SegmentsAreEquivalent(const VP8SegmentInfo * const S1,const VP8SegmentInfo * const S2)356 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
357                                  const VP8SegmentInfo* const S2) {
358   return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
359 }
360 
SimplifySegments(VP8Encoder * const enc)361 static void SimplifySegments(VP8Encoder* const enc) {
362   int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
363   // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
364   // explicit check is needed to avoid a spurious warning about 'i' exceeding
365   // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
366   const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
367                                ? enc->segment_hdr_.num_segments_
368                                : NUM_MB_SEGMENTS;
369   int num_final_segments = 1;
370   int s1, s2;
371   for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
372     const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
373     int found = 0;
374     // check if we already have similar segment
375     for (s2 = 0; s2 < num_final_segments; ++s2) {
376       const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
377       if (SegmentsAreEquivalent(S1, S2)) {
378         found = 1;
379         break;
380       }
381     }
382     map[s1] = s2;
383     if (!found) {
384       if (num_final_segments != s1) {
385         enc->dqm_[num_final_segments] = enc->dqm_[s1];
386       }
387       ++num_final_segments;
388     }
389   }
390   if (num_final_segments < num_segments) {  // Remap
391     int i = enc->mb_w_ * enc->mb_h_;
392     while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
393     enc->segment_hdr_.num_segments_ = num_final_segments;
394     // Replicate the trailing segment infos (it's mostly cosmetics)
395     for (i = num_final_segments; i < num_segments; ++i) {
396       enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
397     }
398   }
399 }
400 
VP8SetSegmentParams(VP8Encoder * const enc,float quality)401 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
402   int i;
403   int dq_uv_ac, dq_uv_dc;
404   const int num_segments = enc->segment_hdr_.num_segments_;
405   const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
406   const double Q = quality / 100.;
407   const double c_base = enc->config_->emulate_jpeg_size ?
408       QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
409       QualityToCompression(Q);
410   for (i = 0; i < num_segments; ++i) {
411     // We modulate the base coefficient to accommodate for the quantization
412     // susceptibility and allow denser segments to be quantized more.
413     const double expn = 1. - amp * enc->dqm_[i].alpha_;
414     const double c = pow(c_base, expn);
415     const int q = (int)(127. * (1. - c));
416     assert(expn > 0.);
417     enc->dqm_[i].quant_ = clip(q, 0, 127);
418   }
419 
420   // purely indicative in the bitstream (except for the 1-segment case)
421   enc->base_quant_ = enc->dqm_[0].quant_;
422 
423   // fill-in values for the unused segments (required by the syntax)
424   for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
425     enc->dqm_[i].quant_ = enc->base_quant_;
426   }
427 
428   // uv_alpha_ is normally spread around ~60. The useful range is
429   // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
430   // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
431   dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
432                                           / (MAX_ALPHA - MIN_ALPHA);
433   // we rescale by the user-defined strength of adaptation
434   dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
435   // and make it safe.
436   dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
437   // We also boost the dc-uv-quant a little, based on sns-strength, since
438   // U/V channels are quite more reactive to high quants (flat DC-blocks
439   // tend to appear, and are unpleasant).
440   dq_uv_dc = -4 * enc->config_->sns_strength / 100;
441   dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed
442 
443   enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
444   enc->dq_y2_dc_ = 0;
445   enc->dq_y2_ac_ = 0;
446   enc->dq_uv_dc_ = dq_uv_dc;
447   enc->dq_uv_ac_ = dq_uv_ac;
448 
449   SetupFilterStrength(enc);   // initialize segments' filtering, eventually
450 
451   if (num_segments > 1) SimplifySegments(enc);
452 
453   SetupMatrices(enc);         // finalize quantization matrices
454 }
455 
456 //------------------------------------------------------------------------------
457 // Form the predictions in cache
458 
459 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
460 const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
461 const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
462 
463 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
464 const int VP8I4ModeOffsets[NUM_BMODES] = {
465   I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
466 };
467 
VP8MakeLuma16Preds(const VP8EncIterator * const it)468 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
469   const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
470   const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
471   VP8EncPredLuma16(it->yuv_p_, left, top);
472 }
473 
VP8MakeChroma8Preds(const VP8EncIterator * const it)474 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
475   const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
476   const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
477   VP8EncPredChroma8(it->yuv_p_, left, top);
478 }
479 
VP8MakeIntra4Preds(const VP8EncIterator * const it)480 void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
481   VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
482 }
483 
484 //------------------------------------------------------------------------------
485 // Quantize
486 
487 // Layout:
488 // +----+----+
489 // |YYYY|UUVV| 0
490 // |YYYY|UUVV| 4
491 // |YYYY|....| 8
492 // |YYYY|....| 12
493 // +----+----+
494 
495 const int VP8Scan[16] = {  // Luma
496   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
497   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
498   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
499   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
500 };
501 
502 static const int VP8ScanUV[4 + 4] = {
503   0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
504   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
505 };
506 
507 //------------------------------------------------------------------------------
508 // Distortion measurement
509 
510 static const uint16_t kWeightY[16] = {
511   38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
512 };
513 
514 static const uint16_t kWeightTrellis[16] = {
515 #if USE_TDISTO == 0
516   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
517 #else
518   30, 27, 19, 11,
519   27, 24, 17, 10,
520   19, 17, 12,  8,
521   11, 10,  8,  6
522 #endif
523 };
524 
525 // Init/Copy the common fields in score.
InitScore(VP8ModeScore * const rd)526 static void InitScore(VP8ModeScore* const rd) {
527   rd->D  = 0;
528   rd->SD = 0;
529   rd->R  = 0;
530   rd->H  = 0;
531   rd->nz = 0;
532   rd->score = MAX_COST;
533 }
534 
CopyScore(VP8ModeScore * const dst,const VP8ModeScore * const src)535 static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
536   dst->D  = src->D;
537   dst->SD = src->SD;
538   dst->R  = src->R;
539   dst->H  = src->H;
540   dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
541   dst->score = src->score;
542 }
543 
AddScore(VP8ModeScore * const dst,const VP8ModeScore * const src)544 static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
545   dst->D  += src->D;
546   dst->SD += src->SD;
547   dst->R  += src->R;
548   dst->H  += src->H;
549   dst->nz |= src->nz;     // here, new nz bits are accumulated.
550   dst->score += src->score;
551 }
552 
553 //------------------------------------------------------------------------------
554 // Performs trellis-optimized quantization.
555 
556 // Trellis node
557 typedef struct {
558   int8_t prev;            // best previous node
559   int8_t sign;            // sign of coeff_i
560   int16_t level;          // level
561 } Node;
562 
563 // Score state
564 typedef struct {
565   score_t score;          // partial RD score
566   const uint16_t* costs;  // shortcut to cost tables
567 } ScoreState;
568 
569 // If a coefficient was quantized to a value Q (using a neutral bias),
570 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
571 // We don't test negative values though.
572 #define MIN_DELTA 0   // how much lower level to try
573 #define MAX_DELTA 1   // how much higher
574 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
575 #define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
576 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
577 
SetRDScore(int lambda,VP8ModeScore * const rd)578 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
579   rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
580 }
581 
RDScoreTrellis(int lambda,score_t rate,score_t distortion)582 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
583                                           score_t distortion) {
584   return rate * lambda + RD_DISTO_MULT * distortion;
585 }
586 
TrellisQuantizeBlock(const VP8Encoder * const enc,int16_t in[16],int16_t out[16],int ctx0,int coeff_type,const VP8Matrix * const mtx,int lambda)587 static int TrellisQuantizeBlock(const VP8Encoder* const enc,
588                                 int16_t in[16], int16_t out[16],
589                                 int ctx0, int coeff_type,
590                                 const VP8Matrix* const mtx,
591                                 int lambda) {
592   const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
593   CostArrayPtr const costs =
594       (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
595   const int first = (coeff_type == 0) ? 1 : 0;
596   Node nodes[16][NUM_NODES];
597   ScoreState score_states[2][NUM_NODES];
598   ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
599   ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
600   int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
601   score_t best_score;
602   int n, m, p, last;
603 
604   {
605     score_t cost;
606     const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
607     const int last_proba = probas[VP8EncBands[first]][ctx0][0];
608 
609     // compute the position of the last interesting coefficient
610     last = first - 1;
611     for (n = 15; n >= first; --n) {
612       const int j = kZigzag[n];
613       const int err = in[j] * in[j];
614       if (err > thresh) {
615         last = n;
616         break;
617       }
618     }
619     // we don't need to go inspect up to n = 16 coeffs. We can just go up
620     // to last + 1 (inclusive) without losing much.
621     if (last < 15) ++last;
622 
623     // compute 'skip' score. This is the max score one can do.
624     cost = VP8BitCost(0, last_proba);
625     best_score = RDScoreTrellis(lambda, cost, 0);
626 
627     // initialize source node.
628     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
629       const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
630       ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
631       ss_cur[m].costs = costs[first][ctx0];
632     }
633   }
634 
635   // traverse trellis.
636   for (n = first; n <= last; ++n) {
637     const int j = kZigzag[n];
638     const uint32_t Q  = mtx->q_[j];
639     const uint32_t iQ = mtx->iq_[j];
640     const uint32_t B = BIAS(0x00);     // neutral bias
641     // note: it's important to take sign of the _original_ coeff,
642     // so we don't have to consider level < 0 afterward.
643     const int sign = (in[j] < 0);
644     const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
645     int level0 = QUANTDIV(coeff0, iQ, B);
646     int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
647     if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
648     if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
649 
650     {   // Swap current and previous score states
651       ScoreState* const tmp = ss_cur;
652       ss_cur = ss_prev;
653       ss_prev = tmp;
654     }
655 
656     // test all alternate level values around level0.
657     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
658       Node* const cur = &NODE(n, m);
659       int level = level0 + m;
660       const int ctx = (level > 2) ? 2 : level;
661       const int band = VP8EncBands[n + 1];
662       score_t base_score;
663       score_t best_cur_score = MAX_COST;
664       int best_prev = 0;   // default, in case
665 
666       ss_cur[m].score = MAX_COST;
667       ss_cur[m].costs = costs[n + 1][ctx];
668       if (level < 0 || level > thresh_level) {
669         // Node is dead.
670         continue;
671       }
672 
673       {
674         // Compute delta_error = how much coding this level will
675         // subtract to max_error as distortion.
676         // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
677         const int new_error = coeff0 - level * Q;
678         const int delta_error =
679             kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
680         base_score = RDScoreTrellis(lambda, 0, delta_error);
681       }
682 
683       // Inspect all possible non-dead predecessors. Retain only the best one.
684       for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
685         // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
686         // eliminated since their score can't be better than the current best.
687         const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
688         // Examine node assuming it's a non-terminal one.
689         const score_t score =
690             base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
691         if (score < best_cur_score) {
692           best_cur_score = score;
693           best_prev = p;
694         }
695       }
696       // Store best finding in current node.
697       cur->sign = sign;
698       cur->level = level;
699       cur->prev = best_prev;
700       ss_cur[m].score = best_cur_score;
701 
702       // Now, record best terminal node (and thus best entry in the graph).
703       if (level != 0) {
704         const score_t last_pos_cost =
705             (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
706         const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
707         const score_t score = best_cur_score + last_pos_score;
708         if (score < best_score) {
709           best_score = score;
710           best_path[0] = n;                     // best eob position
711           best_path[1] = m;                     // best node index
712           best_path[2] = best_prev;             // best predecessor
713         }
714       }
715     }
716   }
717 
718   // Fresh start
719   memset(in + first, 0, (16 - first) * sizeof(*in));
720   memset(out + first, 0, (16 - first) * sizeof(*out));
721   if (best_path[0] == -1) {
722     return 0;   // skip!
723   }
724 
725   {
726     // Unwind the best path.
727     // Note: best-prev on terminal node is not necessarily equal to the
728     // best_prev for non-terminal. So we patch best_path[2] in.
729     int nz = 0;
730     int best_node = best_path[1];
731     n = best_path[0];
732     NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
733 
734     for (; n >= first; --n) {
735       const Node* const node = &NODE(n, best_node);
736       const int j = kZigzag[n];
737       out[n] = node->sign ? -node->level : node->level;
738       nz |= node->level;
739       in[j] = out[n] * mtx->q_[j];
740       best_node = node->prev;
741     }
742     return (nz != 0);
743   }
744 }
745 
746 #undef NODE
747 
748 //------------------------------------------------------------------------------
749 // Performs: difference, transform, quantize, back-transform, add
750 // all at once. Output is the reconstructed block in *yuv_out, and the
751 // quantized levels in *levels.
752 
ReconstructIntra16(VP8EncIterator * const it,VP8ModeScore * const rd,uint8_t * const yuv_out,int mode)753 static int ReconstructIntra16(VP8EncIterator* const it,
754                               VP8ModeScore* const rd,
755                               uint8_t* const yuv_out,
756                               int mode) {
757   const VP8Encoder* const enc = it->enc_;
758   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
759   const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
760   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
761   int nz = 0;
762   int n;
763   int16_t tmp[16][16], dc_tmp[16];
764 
765   for (n = 0; n < 16; n += 2) {
766     VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
767   }
768   VP8FTransformWHT(tmp[0], dc_tmp);
769   nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
770 
771   if (DO_TRELLIS_I16 && it->do_trellis_) {
772     int x, y;
773     VP8IteratorNzToBytes(it);
774     for (y = 0, n = 0; y < 4; ++y) {
775       for (x = 0; x < 4; ++x, ++n) {
776         const int ctx = it->top_nz_[x] + it->left_nz_[y];
777         const int non_zero =
778             TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
779                                  &dqm->y1_, dqm->lambda_trellis_i16_);
780         it->top_nz_[x] = it->left_nz_[y] = non_zero;
781         rd->y_ac_levels[n][0] = 0;
782         nz |= non_zero << n;
783       }
784     }
785   } else {
786     for (n = 0; n < 16; n += 2) {
787       // Zero-out the first coeff, so that: a) nz is correct below, and
788       // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
789       tmp[n][0] = tmp[n + 1][0] = 0;
790       nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
791       assert(rd->y_ac_levels[n + 0][0] == 0);
792       assert(rd->y_ac_levels[n + 1][0] == 0);
793     }
794   }
795 
796   // Transform back
797   VP8TransformWHT(dc_tmp, tmp[0]);
798   for (n = 0; n < 16; n += 2) {
799     VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
800   }
801 
802   return nz;
803 }
804 
ReconstructIntra4(VP8EncIterator * const it,int16_t levels[16],const uint8_t * const src,uint8_t * const yuv_out,int mode)805 static int ReconstructIntra4(VP8EncIterator* const it,
806                              int16_t levels[16],
807                              const uint8_t* const src,
808                              uint8_t* const yuv_out,
809                              int mode) {
810   const VP8Encoder* const enc = it->enc_;
811   const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
812   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
813   int nz = 0;
814   int16_t tmp[16];
815 
816   VP8FTransform(src, ref, tmp);
817   if (DO_TRELLIS_I4 && it->do_trellis_) {
818     const int x = it->i4_ & 3, y = it->i4_ >> 2;
819     const int ctx = it->top_nz_[x] + it->left_nz_[y];
820     nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
821                               dqm->lambda_trellis_i4_);
822   } else {
823     nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
824   }
825   VP8ITransform(ref, tmp, yuv_out, 0);
826   return nz;
827 }
828 
ReconstructUV(VP8EncIterator * const it,VP8ModeScore * const rd,uint8_t * const yuv_out,int mode)829 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
830                          uint8_t* const yuv_out, int mode) {
831   const VP8Encoder* const enc = it->enc_;
832   const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
833   const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
834   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
835   int nz = 0;
836   int n;
837   int16_t tmp[8][16];
838 
839   for (n = 0; n < 8; n += 2) {
840     VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
841   }
842   if (DO_TRELLIS_UV && it->do_trellis_) {
843     int ch, x, y;
844     for (ch = 0, n = 0; ch <= 2; ch += 2) {
845       for (y = 0; y < 2; ++y) {
846         for (x = 0; x < 2; ++x, ++n) {
847           const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
848           const int non_zero =
849               TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
850                                    &dqm->uv_, dqm->lambda_trellis_uv_);
851           it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
852           nz |= non_zero << n;
853         }
854       }
855     }
856   } else {
857     for (n = 0; n < 8; n += 2) {
858       nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
859     }
860   }
861 
862   for (n = 0; n < 8; n += 2) {
863     VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
864   }
865   return (nz << 16);
866 }
867 
868 //------------------------------------------------------------------------------
869 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
870 // Pick the mode is lower RD-cost = Rate + lambda * Distortion.
871 
StoreMaxDelta(VP8SegmentInfo * const dqm,const int16_t DCs[16])872 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
873   // We look at the first three AC coefficients to determine what is the average
874   // delta between each sub-4x4 block.
875   const int v0 = abs(DCs[1]);
876   const int v1 = abs(DCs[2]);
877   const int v2 = abs(DCs[4]);
878   int max_v = (v1 > v0) ? v1 : v0;
879   max_v = (v2 > max_v) ? v2 : max_v;
880   if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
881 }
882 
SwapModeScore(VP8ModeScore ** a,VP8ModeScore ** b)883 static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
884   VP8ModeScore* const tmp = *a;
885   *a = *b;
886   *b = tmp;
887 }
888 
SwapPtr(uint8_t ** a,uint8_t ** b)889 static void SwapPtr(uint8_t** a, uint8_t** b) {
890   uint8_t* const tmp = *a;
891   *a = *b;
892   *b = tmp;
893 }
894 
SwapOut(VP8EncIterator * const it)895 static void SwapOut(VP8EncIterator* const it) {
896   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
897 }
898 
IsFlat(const int16_t * levels,int num_blocks,score_t thresh)899 static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
900   score_t score = 0;
901   while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
902     int i;
903     for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
904       score += (levels[i] != 0);
905       if (score > thresh) return 0;
906     }
907     levels += 16;
908   }
909   return 1;
910 }
911 
PickBestIntra16(VP8EncIterator * const it,VP8ModeScore * rd)912 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
913   const int kNumBlocks = 16;
914   VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
915   const int lambda = dqm->lambda_i16_;
916   const int tlambda = dqm->tlambda_;
917   const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
918   VP8ModeScore rd_tmp;
919   VP8ModeScore* rd_cur = &rd_tmp;
920   VP8ModeScore* rd_best = rd;
921   int mode;
922 
923   rd->mode_i16 = -1;
924   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
925     uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
926     rd_cur->mode_i16 = mode;
927 
928     // Reconstruct
929     rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
930 
931     // Measure RD-score
932     rd_cur->D = VP8SSE16x16(src, tmp_dst);
933     rd_cur->SD =
934         tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
935     rd_cur->H = VP8FixedCostsI16[mode];
936     rd_cur->R = VP8GetCostLuma16(it, rd_cur);
937     if (mode > 0 &&
938         IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
939       // penalty to avoid flat area to be mispredicted by complex mode
940       rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
941     }
942 
943     // Since we always examine Intra16 first, we can overwrite *rd directly.
944     SetRDScore(lambda, rd_cur);
945     if (mode == 0 || rd_cur->score < rd_best->score) {
946       SwapModeScore(&rd_cur, &rd_best);
947       SwapOut(it);
948     }
949   }
950   if (rd_best != rd) {
951     memcpy(rd, rd_best, sizeof(*rd));
952   }
953   SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
954   VP8SetIntra16Mode(it, rd->mode_i16);
955 
956   // we have a blocky macroblock (only DCs are non-zero) with fairly high
957   // distortion, record max delta so we can later adjust the minimal filtering
958   // strength needed to smooth these blocks out.
959   if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
960     StoreMaxDelta(dqm, rd->y_dc_levels);
961   }
962 }
963 
964 //------------------------------------------------------------------------------
965 
966 // return the cost array corresponding to the surrounding prediction modes.
GetCostModeI4(VP8EncIterator * const it,const uint8_t modes[16])967 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
968                                      const uint8_t modes[16]) {
969   const int preds_w = it->enc_->preds_w_;
970   const int x = (it->i4_ & 3), y = it->i4_ >> 2;
971   const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
972   const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
973   return VP8FixedCostsI4[top][left];
974 }
975 
PickBestIntra4(VP8EncIterator * const it,VP8ModeScore * const rd)976 static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
977   const VP8Encoder* const enc = it->enc_;
978   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
979   const int lambda = dqm->lambda_i4_;
980   const int tlambda = dqm->tlambda_;
981   const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
982   uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
983   int total_header_bits = 0;
984   VP8ModeScore rd_best;
985 
986   if (enc->max_i4_header_bits_ == 0) {
987     return 0;
988   }
989 
990   InitScore(&rd_best);
991   rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
992   SetRDScore(dqm->lambda_mode_, &rd_best);
993   VP8IteratorStartI4(it);
994   do {
995     const int kNumBlocks = 1;
996     VP8ModeScore rd_i4;
997     int mode;
998     int best_mode = -1;
999     const uint8_t* const src = src0 + VP8Scan[it->i4_];
1000     const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1001     uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
1002     uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.
1003 
1004     InitScore(&rd_i4);
1005     VP8MakeIntra4Preds(it);
1006     for (mode = 0; mode < NUM_BMODES; ++mode) {
1007       VP8ModeScore rd_tmp;
1008       int16_t tmp_levels[16];
1009 
1010       // Reconstruct
1011       rd_tmp.nz =
1012           ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
1013 
1014       // Compute RD-score
1015       rd_tmp.D = VP8SSE4x4(src, tmp_dst);
1016       rd_tmp.SD =
1017           tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
1018                   : 0;
1019       rd_tmp.H = mode_costs[mode];
1020 
1021       // Add flatness penalty
1022       if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
1023         rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
1024       } else {
1025         rd_tmp.R = 0;
1026       }
1027 
1028       // early-out check
1029       SetRDScore(lambda, &rd_tmp);
1030       if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;
1031 
1032       // finish computing score
1033       rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
1034       SetRDScore(lambda, &rd_tmp);
1035 
1036       if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
1037         CopyScore(&rd_i4, &rd_tmp);
1038         best_mode = mode;
1039         SwapPtr(&tmp_dst, &best_block);
1040         memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
1041                sizeof(rd_best.y_ac_levels[it->i4_]));
1042       }
1043     }
1044     SetRDScore(dqm->lambda_mode_, &rd_i4);
1045     AddScore(&rd_best, &rd_i4);
1046     if (rd_best.score >= rd->score) {
1047       return 0;
1048     }
1049     total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
1050     if (total_header_bits > enc->max_i4_header_bits_) {
1051       return 0;
1052     }
1053     // Copy selected samples if not in the right place already.
1054     if (best_block != best_blocks + VP8Scan[it->i4_]) {
1055       VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
1056     }
1057     rd->modes_i4[it->i4_] = best_mode;
1058     it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
1059   } while (VP8IteratorRotateI4(it, best_blocks));
1060 
1061   // finalize state
1062   CopyScore(rd, &rd_best);
1063   VP8SetIntra4Mode(it, rd->modes_i4);
1064   SwapOut(it);
1065   memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
1066   return 1;   // select intra4x4 over intra16x16
1067 }
1068 
1069 //------------------------------------------------------------------------------
1070 
PickBestUV(VP8EncIterator * const it,VP8ModeScore * const rd)1071 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
1072   const int kNumBlocks = 8;
1073   const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1074   const int lambda = dqm->lambda_uv_;
1075   const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1076   uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
1077   uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
1078   uint8_t* dst = dst0;
1079   VP8ModeScore rd_best;
1080   int mode;
1081 
1082   rd->mode_uv = -1;
1083   InitScore(&rd_best);
1084   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1085     VP8ModeScore rd_uv;
1086 
1087     // Reconstruct
1088     rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
1089 
1090     // Compute RD-score
1091     rd_uv.D  = VP8SSE16x8(src, tmp_dst);
1092     rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
1093     rd_uv.H  = VP8FixedCostsUV[mode];
1094     rd_uv.R  = VP8GetCostUV(it, &rd_uv);
1095     if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
1096       rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
1097     }
1098 
1099     SetRDScore(lambda, &rd_uv);
1100     if (mode == 0 || rd_uv.score < rd_best.score) {
1101       CopyScore(&rd_best, &rd_uv);
1102       rd->mode_uv = mode;
1103       memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
1104       SwapPtr(&dst, &tmp_dst);
1105     }
1106   }
1107   VP8SetIntraUVMode(it, rd->mode_uv);
1108   AddScore(rd, &rd_best);
1109   if (dst != dst0) {   // copy 16x8 block if needed
1110     VP8Copy16x8(dst, dst0);
1111   }
1112 }
1113 
1114 //------------------------------------------------------------------------------
1115 // Final reconstruction and quantization.
1116 
SimpleQuantize(VP8EncIterator * const it,VP8ModeScore * const rd)1117 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
1118   const VP8Encoder* const enc = it->enc_;
1119   const int is_i16 = (it->mb_->type_ == 1);
1120   int nz = 0;
1121 
1122   if (is_i16) {
1123     nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
1124   } else {
1125     VP8IteratorStartI4(it);
1126     do {
1127       const int mode =
1128           it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
1129       const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1130       uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
1131       VP8MakeIntra4Preds(it);
1132       nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1133                               src, dst, mode) << it->i4_;
1134     } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
1135   }
1136 
1137   nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1138   rd->nz = nz;
1139 }
1140 
1141 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
RefineUsingDistortion(VP8EncIterator * const it,int try_both_modes,int refine_uv_mode,VP8ModeScore * const rd)1142 static void RefineUsingDistortion(VP8EncIterator* const it,
1143                                   int try_both_modes, int refine_uv_mode,
1144                                   VP8ModeScore* const rd) {
1145   score_t best_score = MAX_COST;
1146   int nz = 0;
1147   int mode;
1148   int is_i16 = try_both_modes || (it->mb_->type_ == 1);
1149 
1150   const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1151   // Some empiric constants, of approximate order of magnitude.
1152   const int lambda_d_i16 = 106;
1153   const int lambda_d_i4 = 11;
1154   const int lambda_d_uv = 120;
1155   score_t score_i4 = dqm->i4_penalty_;
1156   score_t i4_bit_sum = 0;
1157   const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
1158                                            : MAX_COST;  // no early-out allowed
1159 
1160   if (is_i16) {   // First, evaluate Intra16 distortion
1161     int best_mode = -1;
1162     const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
1163     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1164       const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1165       const score_t score = VP8SSE16x16(src, ref) * RD_DISTO_MULT
1166                           + VP8FixedCostsI16[mode] * lambda_d_i16;
1167       if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
1168         continue;
1169       }
1170       if (score < best_score) {
1171         best_mode = mode;
1172         best_score = score;
1173       }
1174     }
1175     VP8SetIntra16Mode(it, best_mode);
1176     // we'll reconstruct later, if i16 mode actually gets selected
1177   }
1178 
1179   // Next, evaluate Intra4
1180   if (try_both_modes || !is_i16) {
1181     // We don't evaluate the rate here, but just account for it through a
1182     // constant penalty (i4 mode usually needs more bits compared to i16).
1183     is_i16 = 0;
1184     VP8IteratorStartI4(it);
1185     do {
1186       int best_i4_mode = -1;
1187       score_t best_i4_score = MAX_COST;
1188       const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1189       const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1190 
1191       VP8MakeIntra4Preds(it);
1192       for (mode = 0; mode < NUM_BMODES; ++mode) {
1193         const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1194         const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
1195                             + mode_costs[mode] * lambda_d_i4;
1196         if (score < best_i4_score) {
1197           best_i4_mode = mode;
1198           best_i4_score = score;
1199         }
1200       }
1201       i4_bit_sum += mode_costs[best_i4_mode];
1202       rd->modes_i4[it->i4_] = best_i4_mode;
1203       score_i4 += best_i4_score;
1204       if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
1205         // Intra4 won't be better than Intra16. Bail out and pick Intra16.
1206         is_i16 = 1;
1207         break;
1208       } else {  // reconstruct partial block inside yuv_out2_ buffer
1209         uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
1210         nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1211                                 src, tmp_dst, best_i4_mode) << it->i4_;
1212       }
1213     } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
1214   }
1215 
1216   // Final reconstruction, depending on which mode is selected.
1217   if (!is_i16) {
1218     VP8SetIntra4Mode(it, rd->modes_i4);
1219     SwapOut(it);
1220     best_score = score_i4;
1221   } else {
1222     nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
1223   }
1224 
1225   // ... and UV!
1226   if (refine_uv_mode) {
1227     int best_mode = -1;
1228     score_t best_uv_score = MAX_COST;
1229     const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1230     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1231       const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
1232       const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
1233                           + VP8FixedCostsUV[mode] * lambda_d_uv;
1234       if (score < best_uv_score) {
1235         best_mode = mode;
1236         best_uv_score = score;
1237       }
1238     }
1239     VP8SetIntraUVMode(it, best_mode);
1240   }
1241   nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1242 
1243   rd->nz = nz;
1244   rd->score = best_score;
1245 }
1246 
1247 //------------------------------------------------------------------------------
1248 // Entry point
1249 
VP8Decimate(VP8EncIterator * const it,VP8ModeScore * const rd,VP8RDLevel rd_opt)1250 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1251                 VP8RDLevel rd_opt) {
1252   int is_skipped;
1253   const int method = it->enc_->method_;
1254 
1255   InitScore(rd);
1256 
1257   // We can perform predictions for Luma16x16 and Chroma8x8 already.
1258   // Luma4x4 predictions needs to be done as-we-go.
1259   VP8MakeLuma16Preds(it);
1260   VP8MakeChroma8Preds(it);
1261 
1262   if (rd_opt > RD_OPT_NONE) {
1263     it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1264     PickBestIntra16(it, rd);
1265     if (method >= 2) {
1266       PickBestIntra4(it, rd);
1267     }
1268     PickBestUV(it, rd);
1269     if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
1270       it->do_trellis_ = 1;
1271       SimpleQuantize(it, rd);
1272     }
1273   } else {
1274     // At this point we have heuristically decided intra16 / intra4.
1275     // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
1276     // For method <= 1, we don't re-examine the decision but just go ahead with
1277     // quantization/reconstruction.
1278     RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
1279   }
1280   is_skipped = (rd->nz == 0);
1281   VP8SetSkip(it, is_skipped);
1282   return is_skipped;
1283 }
1284