1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 //   Quantization
11 //
12 // Author: Skal (pascal.massimino@gmail.com)
13 
14 #include <assert.h>
15 #include <math.h>
16 #include <stdlib.h>  // for abs()
17 
18 #include "src/dsp/quant.h"
19 #include "src/enc/vp8i_enc.h"
20 #include "src/enc/cost_enc.h"
21 
22 #define DO_TRELLIS_I4  1
23 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
24 #define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
25 #define USE_TDISTO 1
26 
27 #define MID_ALPHA 64      // neutral value for susceptibility
28 #define MIN_ALPHA 30      // lowest usable value for susceptibility
29 #define MAX_ALPHA 100     // higher meaningful value for susceptibility
30 
31 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
32                           // power-law modulation. Must be strictly less than 1.
33 
34 // number of non-zero coeffs below which we consider the block very flat
35 // (and apply a penalty to complex predictions)
36 #define FLATNESS_LIMIT_I16 10      // I16 mode
37 #define FLATNESS_LIMIT_I4  3       // I4 mode
38 #define FLATNESS_LIMIT_UV  2       // UV mode
39 #define FLATNESS_PENALTY   140     // roughly ~1bit per block
40 
41 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
42 
43 #define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
44 
45 // #define DEBUG_BLOCK
46 
47 //------------------------------------------------------------------------------
48 
49 #if defined(DEBUG_BLOCK)
50 
51 #include <stdio.h>
52 #include <stdlib.h>
53 
PrintBlockInfo(const VP8EncIterator * const it,const VP8ModeScore * const rd)54 static void PrintBlockInfo(const VP8EncIterator* const it,
55                            const VP8ModeScore* const rd) {
56   int i, j;
57   const int is_i16 = (it->mb_->type_ == 1);
58   const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
59   const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
60   const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
61   const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
62   printf("SOURCE / OUTPUT / ABS DELTA\n");
63   for (j = 0; j < 16; ++j) {
64     for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
65     printf("     ");
66     for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
67     printf("     ");
68     for (i = 0; i < 16; ++i) {
69       printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
70     }
71     printf("\n");
72   }
73   printf("\n");   // newline before the U/V block
74   for (j = 0; j < 8; ++j) {
75     for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
76     printf(" ");
77     for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
78     printf("    ");
79     for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
80     printf(" ");
81     for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
82     printf("   ");
83     for (i = 0; i < 8; ++i) {
84       printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
85     }
86     printf(" ");
87     for (i = 8; i < 16; ++i) {
88       printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
89     }
90     printf("\n");
91   }
92   printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
93     (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
94     (int)rd->score);
95   if (is_i16) {
96     printf("Mode: %d\n", rd->mode_i16);
97     printf("y_dc_levels:");
98     for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
99     printf("\n");
100   } else {
101     printf("Modes[16]: ");
102     for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
103     printf("\n");
104   }
105   printf("y_ac_levels:\n");
106   for (j = 0; j < 16; ++j) {
107     for (i = is_i16 ? 1 : 0; i < 16; ++i) {
108       printf("%4d ", rd->y_ac_levels[j][i]);
109     }
110     printf("\n");
111   }
112   printf("\n");
113   printf("uv_levels (mode=%d):\n", rd->mode_uv);
114   for (j = 0; j < 8; ++j) {
115     for (i = 0; i < 16; ++i) {
116       printf("%4d ", rd->uv_levels[j][i]);
117     }
118     printf("\n");
119   }
120 }
121 
122 #endif   // DEBUG_BLOCK
123 
124 //------------------------------------------------------------------------------
125 
clip(int v,int m,int M)126 static WEBP_INLINE int clip(int v, int m, int M) {
127   return v < m ? m : v > M ? M : v;
128 }
129 
130 static const uint8_t kZigzag[16] = {
131   0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
132 };
133 
134 static const uint8_t kDcTable[128] = {
135   4,     5,   6,   7,   8,   9,  10,  10,
136   11,   12,  13,  14,  15,  16,  17,  17,
137   18,   19,  20,  20,  21,  21,  22,  22,
138   23,   23,  24,  25,  25,  26,  27,  28,
139   29,   30,  31,  32,  33,  34,  35,  36,
140   37,   37,  38,  39,  40,  41,  42,  43,
141   44,   45,  46,  46,  47,  48,  49,  50,
142   51,   52,  53,  54,  55,  56,  57,  58,
143   59,   60,  61,  62,  63,  64,  65,  66,
144   67,   68,  69,  70,  71,  72,  73,  74,
145   75,   76,  76,  77,  78,  79,  80,  81,
146   82,   83,  84,  85,  86,  87,  88,  89,
147   91,   93,  95,  96,  98, 100, 101, 102,
148   104, 106, 108, 110, 112, 114, 116, 118,
149   122, 124, 126, 128, 130, 132, 134, 136,
150   138, 140, 143, 145, 148, 151, 154, 157
151 };
152 
153 static const uint16_t kAcTable[128] = {
154   4,     5,   6,   7,   8,   9,  10,  11,
155   12,   13,  14,  15,  16,  17,  18,  19,
156   20,   21,  22,  23,  24,  25,  26,  27,
157   28,   29,  30,  31,  32,  33,  34,  35,
158   36,   37,  38,  39,  40,  41,  42,  43,
159   44,   45,  46,  47,  48,  49,  50,  51,
160   52,   53,  54,  55,  56,  57,  58,  60,
161   62,   64,  66,  68,  70,  72,  74,  76,
162   78,   80,  82,  84,  86,  88,  90,  92,
163   94,   96,  98, 100, 102, 104, 106, 108,
164   110, 112, 114, 116, 119, 122, 125, 128,
165   131, 134, 137, 140, 143, 146, 149, 152,
166   155, 158, 161, 164, 167, 170, 173, 177,
167   181, 185, 189, 193, 197, 201, 205, 209,
168   213, 217, 221, 225, 229, 234, 239, 245,
169   249, 254, 259, 264, 269, 274, 279, 284
170 };
171 
172 static const uint16_t kAcTable2[128] = {
173   8,     8,   9,  10,  12,  13,  15,  17,
174   18,   20,  21,  23,  24,  26,  27,  29,
175   31,   32,  34,  35,  37,  38,  40,  41,
176   43,   44,  46,  48,  49,  51,  52,  54,
177   55,   57,  58,  60,  62,  63,  65,  66,
178   68,   69,  71,  72,  74,  75,  77,  79,
179   80,   82,  83,  85,  86,  88,  89,  93,
180   96,   99, 102, 105, 108, 111, 114, 117,
181   120, 124, 127, 130, 133, 136, 139, 142,
182   145, 148, 151, 155, 158, 161, 164, 167,
183   170, 173, 176, 179, 184, 189, 193, 198,
184   203, 207, 212, 217, 221, 226, 230, 235,
185   240, 244, 249, 254, 258, 263, 268, 274,
186   280, 286, 292, 299, 305, 311, 317, 323,
187   330, 336, 342, 348, 354, 362, 370, 379,
188   385, 393, 401, 409, 416, 424, 432, 440
189 };
190 
191 static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
192   { 96, 110 }, { 96, 108 }, { 110, 115 }
193 };
194 
195 // Sharpening by (slightly) raising the hi-frequency coeffs.
196 // Hack-ish but helpful for mid-bitrate range. Use with care.
197 #define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
198 static const uint8_t kFreqSharpening[16] = {
199   0,  30, 60, 90,
200   30, 60, 90, 90,
201   60, 90, 90, 90,
202   90, 90, 90, 90
203 };
204 
205 //------------------------------------------------------------------------------
206 // Initialize quantization parameters in VP8Matrix
207 
208 // Returns the average quantizer
ExpandMatrix(VP8Matrix * const m,int type)209 static int ExpandMatrix(VP8Matrix* const m, int type) {
210   int i, sum;
211   for (i = 0; i < 2; ++i) {
212     const int is_ac_coeff = (i > 0);
213     const int bias = kBiasMatrices[type][is_ac_coeff];
214     m->iq_[i] = (1 << QFIX) / m->q_[i];
215     m->bias_[i] = BIAS(bias);
216     // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
217     //   * zero if coeff <= zthresh
218     //   * non-zero if coeff > zthresh
219     m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
220   }
221   for (i = 2; i < 16; ++i) {
222     m->q_[i] = m->q_[1];
223     m->iq_[i] = m->iq_[1];
224     m->bias_[i] = m->bias_[1];
225     m->zthresh_[i] = m->zthresh_[1];
226   }
227   for (sum = 0, i = 0; i < 16; ++i) {
228     if (type == 0) {  // we only use sharpening for AC luma coeffs
229       m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
230     } else {
231       m->sharpen_[i] = 0;
232     }
233     sum += m->q_[i];
234   }
235   return (sum + 8) >> 4;
236 }
237 
CheckLambdaValue(int * const v)238 static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
239 
SetupMatrices(VP8Encoder * enc)240 static void SetupMatrices(VP8Encoder* enc) {
241   int i;
242   const int tlambda_scale =
243     (enc->method_ >= 4) ? enc->config_->sns_strength
244                         : 0;
245   const int num_segments = enc->segment_hdr_.num_segments_;
246   for (i = 0; i < num_segments; ++i) {
247     VP8SegmentInfo* const m = &enc->dqm_[i];
248     const int q = m->quant_;
249     int q_i4, q_i16, q_uv;
250     m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
251     m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];
252 
253     m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
254     m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
255 
256     m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
257     m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
258 
259     q_i4  = ExpandMatrix(&m->y1_, 0);
260     q_i16 = ExpandMatrix(&m->y2_, 1);
261     q_uv  = ExpandMatrix(&m->uv_, 2);
262 
263     m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
264     m->lambda_i16_         = (3 * q_i16 * q_i16);
265     m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
266     m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
267     m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
268     m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
269     m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
270     m->tlambda_            = (tlambda_scale * q_i4) >> 5;
271 
272     // none of these constants should be < 1
273     CheckLambdaValue(&m->lambda_i4_);
274     CheckLambdaValue(&m->lambda_i16_);
275     CheckLambdaValue(&m->lambda_uv_);
276     CheckLambdaValue(&m->lambda_mode_);
277     CheckLambdaValue(&m->lambda_trellis_i4_);
278     CheckLambdaValue(&m->lambda_trellis_i16_);
279     CheckLambdaValue(&m->lambda_trellis_uv_);
280     CheckLambdaValue(&m->tlambda_);
281 
282     m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
283     m->max_edge_  = 0;
284 
285     m->i4_penalty_ = 1000 * q_i4 * q_i4;
286   }
287 }
288 
289 //------------------------------------------------------------------------------
290 // Initialize filtering parameters
291 
292 // Very small filter-strength values have close to no visual effect. So we can
293 // save a little decoding-CPU by turning filtering off for these.
294 #define FSTRENGTH_CUTOFF 2
295 
SetupFilterStrength(VP8Encoder * const enc)296 static void SetupFilterStrength(VP8Encoder* const enc) {
297   int i;
298   // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
299   const int level0 = 5 * enc->config_->filter_strength;
300   for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
301     VP8SegmentInfo* const m = &enc->dqm_[i];
302     // We focus on the quantization of AC coeffs.
303     const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
304     const int base_strength =
305         VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
306     // Segments with lower complexity ('beta') will be less filtered.
307     const int f = base_strength * level0 / (256 + m->beta_);
308     m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
309   }
310   // We record the initial strength (mainly for the case of 1-segment only).
311   enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
312   enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
313   enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
314 }
315 
316 //------------------------------------------------------------------------------
317 
318 // Note: if you change the values below, remember that the max range
319 // allowed by the syntax for DQ_UV is [-16,16].
320 #define MAX_DQ_UV (6)
321 #define MIN_DQ_UV (-4)
322 
323 // We want to emulate jpeg-like behaviour where the expected "good" quality
324 // is around q=75. Internally, our "good" middle is around c=50. So we
325 // map accordingly using linear piece-wise function
QualityToCompression(double c)326 static double QualityToCompression(double c) {
327   const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
328   // The file size roughly scales as pow(quantizer, 3.). Actually, the
329   // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
330   // in the mid-quant range. So we scale the compressibility inversely to
331   // this power-law: quant ~= compression ^ 1/3. This law holds well for
332   // low quant. Finer modeling for high-quant would make use of kAcTable[]
333   // more explicitly.
334   const double v = pow(linear_c, 1 / 3.);
335   return v;
336 }
337 
QualityToJPEGCompression(double c,double alpha)338 static double QualityToJPEGCompression(double c, double alpha) {
339   // We map the complexity 'alpha' and quality setting 'c' to a compression
340   // exponent empirically matched to the compression curve of libjpeg6b.
341   // On average, the WebP output size will be roughly similar to that of a
342   // JPEG file compressed with same quality factor.
343   const double amin = 0.30;
344   const double amax = 0.85;
345   const double exp_min = 0.4;
346   const double exp_max = 0.9;
347   const double slope = (exp_min - exp_max) / (amax - amin);
348   // Linearly interpolate 'expn' from exp_min to exp_max
349   // in the [amin, amax] range.
350   const double expn = (alpha > amax) ? exp_min
351                     : (alpha < amin) ? exp_max
352                     : exp_max + slope * (alpha - amin);
353   const double v = pow(c, expn);
354   return v;
355 }
356 
SegmentsAreEquivalent(const VP8SegmentInfo * const S1,const VP8SegmentInfo * const S2)357 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
358                                  const VP8SegmentInfo* const S2) {
359   return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
360 }
361 
SimplifySegments(VP8Encoder * const enc)362 static void SimplifySegments(VP8Encoder* const enc) {
363   int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
364   // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
365   // explicit check is needed to avoid a spurious warning about 'i' exceeding
366   // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
367   const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
368                                ? enc->segment_hdr_.num_segments_
369                                : NUM_MB_SEGMENTS;
370   int num_final_segments = 1;
371   int s1, s2;
372   for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
373     const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
374     int found = 0;
375     // check if we already have similar segment
376     for (s2 = 0; s2 < num_final_segments; ++s2) {
377       const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
378       if (SegmentsAreEquivalent(S1, S2)) {
379         found = 1;
380         break;
381       }
382     }
383     map[s1] = s2;
384     if (!found) {
385       if (num_final_segments != s1) {
386         enc->dqm_[num_final_segments] = enc->dqm_[s1];
387       }
388       ++num_final_segments;
389     }
390   }
391   if (num_final_segments < num_segments) {  // Remap
392     int i = enc->mb_w_ * enc->mb_h_;
393     while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
394     enc->segment_hdr_.num_segments_ = num_final_segments;
395     // Replicate the trailing segment infos (it's mostly cosmetics)
396     for (i = num_final_segments; i < num_segments; ++i) {
397       enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
398     }
399   }
400 }
401 
VP8SetSegmentParams(VP8Encoder * const enc,float quality)402 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
403   int i;
404   int dq_uv_ac, dq_uv_dc;
405   const int num_segments = enc->segment_hdr_.num_segments_;
406   const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
407   const double Q = quality / 100.;
408   const double c_base = enc->config_->emulate_jpeg_size ?
409       QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
410       QualityToCompression(Q);
411   for (i = 0; i < num_segments; ++i) {
412     // We modulate the base coefficient to accommodate for the quantization
413     // susceptibility and allow denser segments to be quantized more.
414     const double expn = 1. - amp * enc->dqm_[i].alpha_;
415     const double c = pow(c_base, expn);
416     const int q = (int)(127. * (1. - c));
417     assert(expn > 0.);
418     enc->dqm_[i].quant_ = clip(q, 0, 127);
419   }
420 
421   // purely indicative in the bitstream (except for the 1-segment case)
422   enc->base_quant_ = enc->dqm_[0].quant_;
423 
424   // fill-in values for the unused segments (required by the syntax)
425   for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
426     enc->dqm_[i].quant_ = enc->base_quant_;
427   }
428 
429   // uv_alpha_ is normally spread around ~60. The useful range is
430   // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
431   // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
432   dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
433                                           / (MAX_ALPHA - MIN_ALPHA);
434   // we rescale by the user-defined strength of adaptation
435   dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
436   // and make it safe.
437   dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
438   // We also boost the dc-uv-quant a little, based on sns-strength, since
439   // U/V channels are quite more reactive to high quants (flat DC-blocks
440   // tend to appear, and are unpleasant).
441   dq_uv_dc = -4 * enc->config_->sns_strength / 100;
442   dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed
443 
444   enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
445   enc->dq_y2_dc_ = 0;
446   enc->dq_y2_ac_ = 0;
447   enc->dq_uv_dc_ = dq_uv_dc;
448   enc->dq_uv_ac_ = dq_uv_ac;
449 
450   SetupFilterStrength(enc);   // initialize segments' filtering, eventually
451 
452   if (num_segments > 1) SimplifySegments(enc);
453 
454   SetupMatrices(enc);         // finalize quantization matrices
455 }
456 
457 //------------------------------------------------------------------------------
458 // Form the predictions in cache
459 
460 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
461 const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
462 const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
463 
464 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
465 const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
466   I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
467 };
468 
VP8MakeLuma16Preds(const VP8EncIterator * const it)469 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
470   const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
471   const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
472   VP8EncPredLuma16(it->yuv_p_, left, top);
473 }
474 
VP8MakeChroma8Preds(const VP8EncIterator * const it)475 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
476   const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
477   const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
478   VP8EncPredChroma8(it->yuv_p_, left, top);
479 }
480 
VP8MakeIntra4Preds(const VP8EncIterator * const it)481 void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
482   VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
483 }
484 
485 //------------------------------------------------------------------------------
486 // Quantize
487 
488 // Layout:
489 // +----+----+
490 // |YYYY|UUVV| 0
491 // |YYYY|UUVV| 4
492 // |YYYY|....| 8
493 // |YYYY|....| 12
494 // +----+----+
495 
496 const uint16_t VP8Scan[16] = {  // Luma
497   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
498   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
499   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
500   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
501 };
502 
503 static const uint16_t VP8ScanUV[4 + 4] = {
504   0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
505   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
506 };
507 
508 //------------------------------------------------------------------------------
509 // Distortion measurement
510 
511 static const uint16_t kWeightY[16] = {
512   38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
513 };
514 
515 static const uint16_t kWeightTrellis[16] = {
516 #if USE_TDISTO == 0
517   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
518 #else
519   30, 27, 19, 11,
520   27, 24, 17, 10,
521   19, 17, 12,  8,
522   11, 10,  8,  6
523 #endif
524 };
525 
526 // Init/Copy the common fields in score.
InitScore(VP8ModeScore * const rd)527 static void InitScore(VP8ModeScore* const rd) {
528   rd->D  = 0;
529   rd->SD = 0;
530   rd->R  = 0;
531   rd->H  = 0;
532   rd->nz = 0;
533   rd->score = MAX_COST;
534 }
535 
CopyScore(VP8ModeScore * const dst,const VP8ModeScore * const src)536 static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
537   dst->D  = src->D;
538   dst->SD = src->SD;
539   dst->R  = src->R;
540   dst->H  = src->H;
541   dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
542   dst->score = src->score;
543 }
544 
AddScore(VP8ModeScore * const dst,const VP8ModeScore * const src)545 static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
546   dst->D  += src->D;
547   dst->SD += src->SD;
548   dst->R  += src->R;
549   dst->H  += src->H;
550   dst->nz |= src->nz;     // here, new nz bits are accumulated.
551   dst->score += src->score;
552 }
553 
554 //------------------------------------------------------------------------------
555 // Performs trellis-optimized quantization.
556 
557 // Trellis node
558 typedef struct {
559   int8_t prev;            // best previous node
560   int8_t sign;            // sign of coeff_i
561   int16_t level;          // level
562 } Node;
563 
564 // Score state
565 typedef struct {
566   score_t score;          // partial RD score
567   const uint16_t* costs;  // shortcut to cost tables
568 } ScoreState;
569 
570 // If a coefficient was quantized to a value Q (using a neutral bias),
571 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
572 // We don't test negative values though.
573 #define MIN_DELTA 0   // how much lower level to try
574 #define MAX_DELTA 1   // how much higher
575 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
576 #define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
577 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
578 
SetRDScore(int lambda,VP8ModeScore * const rd)579 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
580   rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
581 }
582 
RDScoreTrellis(int lambda,score_t rate,score_t distortion)583 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
584                                           score_t distortion) {
585   return rate * lambda + RD_DISTO_MULT * distortion;
586 }
587 
TrellisQuantizeBlock(const VP8Encoder * const enc,int16_t in[16],int16_t out[16],int ctx0,int coeff_type,const VP8Matrix * const mtx,int lambda)588 static int TrellisQuantizeBlock(const VP8Encoder* const enc,
589                                 int16_t in[16], int16_t out[16],
590                                 int ctx0, int coeff_type,
591                                 const VP8Matrix* const mtx,
592                                 int lambda) {
593   const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
594   CostArrayPtr const costs =
595       (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
596   const int first = (coeff_type == 0) ? 1 : 0;
597   Node nodes[16][NUM_NODES];
598   ScoreState score_states[2][NUM_NODES];
599   ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
600   ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
601   int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
602   score_t best_score;
603   int n, m, p, last;
604 
605   {
606     score_t cost;
607     const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
608     const int last_proba = probas[VP8EncBands[first]][ctx0][0];
609 
610     // compute the position of the last interesting coefficient
611     last = first - 1;
612     for (n = 15; n >= first; --n) {
613       const int j = kZigzag[n];
614       const int err = in[j] * in[j];
615       if (err > thresh) {
616         last = n;
617         break;
618       }
619     }
620     // we don't need to go inspect up to n = 16 coeffs. We can just go up
621     // to last + 1 (inclusive) without losing much.
622     if (last < 15) ++last;
623 
624     // compute 'skip' score. This is the max score one can do.
625     cost = VP8BitCost(0, last_proba);
626     best_score = RDScoreTrellis(lambda, cost, 0);
627 
628     // initialize source node.
629     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
630       const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
631       ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
632       ss_cur[m].costs = costs[first][ctx0];
633     }
634   }
635 
636   // traverse trellis.
637   for (n = first; n <= last; ++n) {
638     const int j = kZigzag[n];
639     const uint32_t Q  = mtx->q_[j];
640     const uint32_t iQ = mtx->iq_[j];
641     const uint32_t B = BIAS(0x00);     // neutral bias
642     // note: it's important to take sign of the _original_ coeff,
643     // so we don't have to consider level < 0 afterward.
644     const int sign = (in[j] < 0);
645     const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
646     int level0 = QUANTDIV(coeff0, iQ, B);
647     int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
648     if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
649     if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
650 
651     {   // Swap current and previous score states
652       ScoreState* const tmp = ss_cur;
653       ss_cur = ss_prev;
654       ss_prev = tmp;
655     }
656 
657     // test all alternate level values around level0.
658     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
659       Node* const cur = &NODE(n, m);
660       int level = level0 + m;
661       const int ctx = (level > 2) ? 2 : level;
662       const int band = VP8EncBands[n + 1];
663       score_t base_score;
664       score_t best_cur_score = MAX_COST;
665       int best_prev = 0;   // default, in case
666 
667       ss_cur[m].score = MAX_COST;
668       ss_cur[m].costs = costs[n + 1][ctx];
669       if (level < 0 || level > thresh_level) {
670         // Node is dead.
671         continue;
672       }
673 
674       {
675         // Compute delta_error = how much coding this level will
676         // subtract to max_error as distortion.
677         // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
678         const int new_error = coeff0 - level * Q;
679         const int delta_error =
680             kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
681         base_score = RDScoreTrellis(lambda, 0, delta_error);
682       }
683 
684       // Inspect all possible non-dead predecessors. Retain only the best one.
685       for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
686         // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
687         // eliminated since their score can't be better than the current best.
688         const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
689         // Examine node assuming it's a non-terminal one.
690         const score_t score =
691             base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
692         if (score < best_cur_score) {
693           best_cur_score = score;
694           best_prev = p;
695         }
696       }
697       // Store best finding in current node.
698       cur->sign = sign;
699       cur->level = level;
700       cur->prev = best_prev;
701       ss_cur[m].score = best_cur_score;
702 
703       // Now, record best terminal node (and thus best entry in the graph).
704       if (level != 0) {
705         const score_t last_pos_cost =
706             (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
707         const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
708         const score_t score = best_cur_score + last_pos_score;
709         if (score < best_score) {
710           best_score = score;
711           best_path[0] = n;                     // best eob position
712           best_path[1] = m;                     // best node index
713           best_path[2] = best_prev;             // best predecessor
714         }
715       }
716     }
717   }
718 
719   // Fresh start
720   memset(in + first, 0, (16 - first) * sizeof(*in));
721   memset(out + first, 0, (16 - first) * sizeof(*out));
722   if (best_path[0] == -1) {
723     return 0;   // skip!
724   }
725 
726   {
727     // Unwind the best path.
728     // Note: best-prev on terminal node is not necessarily equal to the
729     // best_prev for non-terminal. So we patch best_path[2] in.
730     int nz = 0;
731     int best_node = best_path[1];
732     n = best_path[0];
733     NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
734 
735     for (; n >= first; --n) {
736       const Node* const node = &NODE(n, best_node);
737       const int j = kZigzag[n];
738       out[n] = node->sign ? -node->level : node->level;
739       nz |= node->level;
740       in[j] = out[n] * mtx->q_[j];
741       best_node = node->prev;
742     }
743     return (nz != 0);
744   }
745 }
746 
747 #undef NODE
748 
749 //------------------------------------------------------------------------------
750 // Performs: difference, transform, quantize, back-transform, add
751 // all at once. Output is the reconstructed block in *yuv_out, and the
752 // quantized levels in *levels.
753 
ReconstructIntra16(VP8EncIterator * const it,VP8ModeScore * const rd,uint8_t * const yuv_out,int mode)754 static int ReconstructIntra16(VP8EncIterator* const it,
755                               VP8ModeScore* const rd,
756                               uint8_t* const yuv_out,
757                               int mode) {
758   const VP8Encoder* const enc = it->enc_;
759   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
760   const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
761   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
762   int nz = 0;
763   int n;
764   int16_t tmp[16][16], dc_tmp[16];
765 
766   for (n = 0; n < 16; n += 2) {
767     VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
768   }
769   VP8FTransformWHT(tmp[0], dc_tmp);
770   nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
771 
772   if (DO_TRELLIS_I16 && it->do_trellis_) {
773     int x, y;
774     VP8IteratorNzToBytes(it);
775     for (y = 0, n = 0; y < 4; ++y) {
776       for (x = 0; x < 4; ++x, ++n) {
777         const int ctx = it->top_nz_[x] + it->left_nz_[y];
778         const int non_zero =
779             TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
780                                  &dqm->y1_, dqm->lambda_trellis_i16_);
781         it->top_nz_[x] = it->left_nz_[y] = non_zero;
782         rd->y_ac_levels[n][0] = 0;
783         nz |= non_zero << n;
784       }
785     }
786   } else {
787     for (n = 0; n < 16; n += 2) {
788       // Zero-out the first coeff, so that: a) nz is correct below, and
789       // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
790       tmp[n][0] = tmp[n + 1][0] = 0;
791       nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
792       assert(rd->y_ac_levels[n + 0][0] == 0);
793       assert(rd->y_ac_levels[n + 1][0] == 0);
794     }
795   }
796 
797   // Transform back
798   VP8TransformWHT(dc_tmp, tmp[0]);
799   for (n = 0; n < 16; n += 2) {
800     VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
801   }
802 
803   return nz;
804 }
805 
ReconstructIntra4(VP8EncIterator * const it,int16_t levels[16],const uint8_t * const src,uint8_t * const yuv_out,int mode)806 static int ReconstructIntra4(VP8EncIterator* const it,
807                              int16_t levels[16],
808                              const uint8_t* const src,
809                              uint8_t* const yuv_out,
810                              int mode) {
811   const VP8Encoder* const enc = it->enc_;
812   const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
813   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
814   int nz = 0;
815   int16_t tmp[16];
816 
817   VP8FTransform(src, ref, tmp);
818   if (DO_TRELLIS_I4 && it->do_trellis_) {
819     const int x = it->i4_ & 3, y = it->i4_ >> 2;
820     const int ctx = it->top_nz_[x] + it->left_nz_[y];
821     nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
822                               dqm->lambda_trellis_i4_);
823   } else {
824     nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
825   }
826   VP8ITransform(ref, tmp, yuv_out, 0);
827   return nz;
828 }
829 
830 //------------------------------------------------------------------------------
831 // DC-error diffusion
832 
833 // Diffusion weights. We under-correct a bit (15/16th of the error is actually
834 // diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
835 #define C1 7    // fraction of error sent to the 4x4 block below
836 #define C2 8    // fraction of error sent to the 4x4 block on the right
837 #define DSHIFT 4
838 #define DSCALE 1   // storage descaling, needed to make the error fit int8_t
839 
840 // Quantize as usual, but also compute and return the quantization error.
841 // Error is already divided by DSHIFT.
QuantizeSingle(int16_t * const v,const VP8Matrix * const mtx)842 static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
843   int V = *v;
844   const int sign = (V < 0);
845   if (sign) V = -V;
846   if (V > (int)mtx->zthresh_[0]) {
847     const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
848     const int err = (V - qV);
849     *v = sign ? -qV : qV;
850     return (sign ? -err : err) >> DSCALE;
851   }
852   *v = 0;
853   return (sign ? -V : V) >> DSCALE;
854 }
855 
CorrectDCValues(const VP8EncIterator * const it,const VP8Matrix * const mtx,int16_t tmp[][16],VP8ModeScore * const rd)856 static void CorrectDCValues(const VP8EncIterator* const it,
857                             const VP8Matrix* const mtx,
858                             int16_t tmp[][16], VP8ModeScore* const rd) {
859   //         | top[0] | top[1]
860   // --------+--------+---------
861   // left[0] | tmp[0]   tmp[1]  <->   err0 err1
862   // left[1] | tmp[2]   tmp[3]        err2 err3
863   //
864   // Final errors {err1,err2,err3} are preserved and later restored
865   // as top[]/left[] on the next block.
866   int ch;
867   for (ch = 0; ch <= 1; ++ch) {
868     const int8_t* const top = it->top_derr_[it->x_][ch];
869     const int8_t* const left = it->left_derr_[ch];
870     int16_t (* const c)[16] = &tmp[ch * 4];
871     int err0, err1, err2, err3;
872     c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
873     err0 = QuantizeSingle(&c[0][0], mtx);
874     c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
875     err1 = QuantizeSingle(&c[1][0], mtx);
876     c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
877     err2 = QuantizeSingle(&c[2][0], mtx);
878     c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
879     err3 = QuantizeSingle(&c[3][0], mtx);
880     // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
881     // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
882     assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
883     rd->derr[ch][0] = (int8_t)err1;
884     rd->derr[ch][1] = (int8_t)err2;
885     rd->derr[ch][2] = (int8_t)err3;
886   }
887 }
888 
StoreDiffusionErrors(VP8EncIterator * const it,const VP8ModeScore * const rd)889 static void StoreDiffusionErrors(VP8EncIterator* const it,
890                                  const VP8ModeScore* const rd) {
891   int ch;
892   for (ch = 0; ch <= 1; ++ch) {
893     int8_t* const top = it->top_derr_[it->x_][ch];
894     int8_t* const left = it->left_derr_[ch];
895     left[0] = rd->derr[ch][0];            // restore err1
896     left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
897     top[0]  = rd->derr[ch][1];            //     ... err2
898     top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
899   }
900 }
901 
902 #undef C1
903 #undef C2
904 #undef DSHIFT
905 #undef DSCALE
906 
907 //------------------------------------------------------------------------------
908 
ReconstructUV(VP8EncIterator * const it,VP8ModeScore * const rd,uint8_t * const yuv_out,int mode)909 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
910                          uint8_t* const yuv_out, int mode) {
911   const VP8Encoder* const enc = it->enc_;
912   const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
913   const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
914   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
915   int nz = 0;
916   int n;
917   int16_t tmp[8][16];
918 
919   for (n = 0; n < 8; n += 2) {
920     VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
921   }
922   if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
923 
924   if (DO_TRELLIS_UV && it->do_trellis_) {
925     int ch, x, y;
926     for (ch = 0, n = 0; ch <= 2; ch += 2) {
927       for (y = 0; y < 2; ++y) {
928         for (x = 0; x < 2; ++x, ++n) {
929           const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
930           const int non_zero =
931               TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
932                                    &dqm->uv_, dqm->lambda_trellis_uv_);
933           it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
934           nz |= non_zero << n;
935         }
936       }
937     }
938   } else {
939     for (n = 0; n < 8; n += 2) {
940       nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
941     }
942   }
943 
944   for (n = 0; n < 8; n += 2) {
945     VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
946   }
947   return (nz << 16);
948 }
949 
950 //------------------------------------------------------------------------------
951 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
952 // Pick the mode is lower RD-cost = Rate + lambda * Distortion.
953 
StoreMaxDelta(VP8SegmentInfo * const dqm,const int16_t DCs[16])954 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
955   // We look at the first three AC coefficients to determine what is the average
956   // delta between each sub-4x4 block.
957   const int v0 = abs(DCs[1]);
958   const int v1 = abs(DCs[2]);
959   const int v2 = abs(DCs[4]);
960   int max_v = (v1 > v0) ? v1 : v0;
961   max_v = (v2 > max_v) ? v2 : max_v;
962   if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
963 }
964 
SwapModeScore(VP8ModeScore ** a,VP8ModeScore ** b)965 static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
966   VP8ModeScore* const tmp = *a;
967   *a = *b;
968   *b = tmp;
969 }
970 
SwapPtr(uint8_t ** a,uint8_t ** b)971 static void SwapPtr(uint8_t** a, uint8_t** b) {
972   uint8_t* const tmp = *a;
973   *a = *b;
974   *b = tmp;
975 }
976 
SwapOut(VP8EncIterator * const it)977 static void SwapOut(VP8EncIterator* const it) {
978   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
979 }
980 
PickBestIntra16(VP8EncIterator * const it,VP8ModeScore * rd)981 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
982   const int kNumBlocks = 16;
983   VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
984   const int lambda = dqm->lambda_i16_;
985   const int tlambda = dqm->tlambda_;
986   const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
987   VP8ModeScore rd_tmp;
988   VP8ModeScore* rd_cur = &rd_tmp;
989   VP8ModeScore* rd_best = rd;
990   int mode;
991 
992   rd->mode_i16 = -1;
993   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
994     uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
995     rd_cur->mode_i16 = mode;
996 
997     // Reconstruct
998     rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
999 
1000     // Measure RD-score
1001     rd_cur->D = VP8SSE16x16(src, tmp_dst);
1002     rd_cur->SD =
1003         tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
1004     rd_cur->H = VP8FixedCostsI16[mode];
1005     rd_cur->R = VP8GetCostLuma16(it, rd_cur);
1006     if (mode > 0 &&
1007         IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
1008       // penalty to avoid flat area to be mispredicted by complex mode
1009       rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
1010     }
1011 
1012     // Since we always examine Intra16 first, we can overwrite *rd directly.
1013     SetRDScore(lambda, rd_cur);
1014     if (mode == 0 || rd_cur->score < rd_best->score) {
1015       SwapModeScore(&rd_cur, &rd_best);
1016       SwapOut(it);
1017     }
1018   }
1019   if (rd_best != rd) {
1020     memcpy(rd, rd_best, sizeof(*rd));
1021   }
1022   SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
1023   VP8SetIntra16Mode(it, rd->mode_i16);
1024 
1025   // we have a blocky macroblock (only DCs are non-zero) with fairly high
1026   // distortion, record max delta so we can later adjust the minimal filtering
1027   // strength needed to smooth these blocks out.
1028   if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
1029     StoreMaxDelta(dqm, rd->y_dc_levels);
1030   }
1031 }
1032 
1033 //------------------------------------------------------------------------------
1034 
1035 // return the cost array corresponding to the surrounding prediction modes.
GetCostModeI4(VP8EncIterator * const it,const uint8_t modes[16])1036 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
1037                                      const uint8_t modes[16]) {
1038   const int preds_w = it->enc_->preds_w_;
1039   const int x = (it->i4_ & 3), y = it->i4_ >> 2;
1040   const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
1041   const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
1042   return VP8FixedCostsI4[top][left];
1043 }
1044 
PickBestIntra4(VP8EncIterator * const it,VP8ModeScore * const rd)1045 static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
1046   const VP8Encoder* const enc = it->enc_;
1047   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
1048   const int lambda = dqm->lambda_i4_;
1049   const int tlambda = dqm->tlambda_;
1050   const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
1051   uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
1052   int total_header_bits = 0;
1053   VP8ModeScore rd_best;
1054 
1055   if (enc->max_i4_header_bits_ == 0) {
1056     return 0;
1057   }
1058 
1059   InitScore(&rd_best);
1060   rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
1061   SetRDScore(dqm->lambda_mode_, &rd_best);
1062   VP8IteratorStartI4(it);
1063   do {
1064     const int kNumBlocks = 1;
1065     VP8ModeScore rd_i4;
1066     int mode;
1067     int best_mode = -1;
1068     const uint8_t* const src = src0 + VP8Scan[it->i4_];
1069     const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1070     uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
1071     uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.
1072 
1073     InitScore(&rd_i4);
1074     VP8MakeIntra4Preds(it);
1075     for (mode = 0; mode < NUM_BMODES; ++mode) {
1076       VP8ModeScore rd_tmp;
1077       int16_t tmp_levels[16];
1078 
1079       // Reconstruct
1080       rd_tmp.nz =
1081           ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
1082 
1083       // Compute RD-score
1084       rd_tmp.D = VP8SSE4x4(src, tmp_dst);
1085       rd_tmp.SD =
1086           tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
1087                   : 0;
1088       rd_tmp.H = mode_costs[mode];
1089 
1090       // Add flatness penalty
1091       if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
1092         rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
1093       } else {
1094         rd_tmp.R = 0;
1095       }
1096 
1097       // early-out check
1098       SetRDScore(lambda, &rd_tmp);
1099       if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;
1100 
1101       // finish computing score
1102       rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
1103       SetRDScore(lambda, &rd_tmp);
1104 
1105       if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
1106         CopyScore(&rd_i4, &rd_tmp);
1107         best_mode = mode;
1108         SwapPtr(&tmp_dst, &best_block);
1109         memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
1110                sizeof(rd_best.y_ac_levels[it->i4_]));
1111       }
1112     }
1113     SetRDScore(dqm->lambda_mode_, &rd_i4);
1114     AddScore(&rd_best, &rd_i4);
1115     if (rd_best.score >= rd->score) {
1116       return 0;
1117     }
1118     total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
1119     if (total_header_bits > enc->max_i4_header_bits_) {
1120       return 0;
1121     }
1122     // Copy selected samples if not in the right place already.
1123     if (best_block != best_blocks + VP8Scan[it->i4_]) {
1124       VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
1125     }
1126     rd->modes_i4[it->i4_] = best_mode;
1127     it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
1128   } while (VP8IteratorRotateI4(it, best_blocks));
1129 
1130   // finalize state
1131   CopyScore(rd, &rd_best);
1132   VP8SetIntra4Mode(it, rd->modes_i4);
1133   SwapOut(it);
1134   memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
1135   return 1;   // select intra4x4 over intra16x16
1136 }
1137 
1138 //------------------------------------------------------------------------------
1139 
PickBestUV(VP8EncIterator * const it,VP8ModeScore * const rd)1140 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
1141   const int kNumBlocks = 8;
1142   const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1143   const int lambda = dqm->lambda_uv_;
1144   const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1145   uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
1146   uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
1147   uint8_t* dst = dst0;
1148   VP8ModeScore rd_best;
1149   int mode;
1150 
1151   rd->mode_uv = -1;
1152   InitScore(&rd_best);
1153   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1154     VP8ModeScore rd_uv;
1155 
1156     // Reconstruct
1157     rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
1158 
1159     // Compute RD-score
1160     rd_uv.D  = VP8SSE16x8(src, tmp_dst);
1161     rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
1162     rd_uv.H  = VP8FixedCostsUV[mode];
1163     rd_uv.R  = VP8GetCostUV(it, &rd_uv);
1164     if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
1165       rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
1166     }
1167 
1168     SetRDScore(lambda, &rd_uv);
1169     if (mode == 0 || rd_uv.score < rd_best.score) {
1170       CopyScore(&rd_best, &rd_uv);
1171       rd->mode_uv = mode;
1172       memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
1173       if (it->top_derr_ != NULL) {
1174         memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
1175       }
1176       SwapPtr(&dst, &tmp_dst);
1177     }
1178   }
1179   VP8SetIntraUVMode(it, rd->mode_uv);
1180   AddScore(rd, &rd_best);
1181   if (dst != dst0) {   // copy 16x8 block if needed
1182     VP8Copy16x8(dst, dst0);
1183   }
1184   if (it->top_derr_ != NULL) {  // store diffusion errors for next block
1185     StoreDiffusionErrors(it, rd);
1186   }
1187 }
1188 
1189 //------------------------------------------------------------------------------
1190 // Final reconstruction and quantization.
1191 
SimpleQuantize(VP8EncIterator * const it,VP8ModeScore * const rd)1192 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
1193   const VP8Encoder* const enc = it->enc_;
1194   const int is_i16 = (it->mb_->type_ == 1);
1195   int nz = 0;
1196 
1197   if (is_i16) {
1198     nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
1199   } else {
1200     VP8IteratorStartI4(it);
1201     do {
1202       const int mode =
1203           it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
1204       const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1205       uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
1206       VP8MakeIntra4Preds(it);
1207       nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1208                               src, dst, mode) << it->i4_;
1209     } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
1210   }
1211 
1212   nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1213   rd->nz = nz;
1214 }
1215 
1216 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
RefineUsingDistortion(VP8EncIterator * const it,int try_both_modes,int refine_uv_mode,VP8ModeScore * const rd)1217 static void RefineUsingDistortion(VP8EncIterator* const it,
1218                                   int try_both_modes, int refine_uv_mode,
1219                                   VP8ModeScore* const rd) {
1220   score_t best_score = MAX_COST;
1221   int nz = 0;
1222   int mode;
1223   int is_i16 = try_both_modes || (it->mb_->type_ == 1);
1224 
1225   const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1226   // Some empiric constants, of approximate order of magnitude.
1227   const int lambda_d_i16 = 106;
1228   const int lambda_d_i4 = 11;
1229   const int lambda_d_uv = 120;
1230   score_t score_i4 = dqm->i4_penalty_;
1231   score_t i4_bit_sum = 0;
1232   const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
1233                                            : MAX_COST;  // no early-out allowed
1234 
1235   if (is_i16) {   // First, evaluate Intra16 distortion
1236     int best_mode = -1;
1237     const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
1238     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1239       const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1240       const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
1241                           + VP8FixedCostsI16[mode] * lambda_d_i16;
1242       if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
1243         continue;
1244       }
1245       if (score < best_score) {
1246         best_mode = mode;
1247         best_score = score;
1248       }
1249     }
1250     VP8SetIntra16Mode(it, best_mode);
1251     // we'll reconstruct later, if i16 mode actually gets selected
1252   }
1253 
1254   // Next, evaluate Intra4
1255   if (try_both_modes || !is_i16) {
1256     // We don't evaluate the rate here, but just account for it through a
1257     // constant penalty (i4 mode usually needs more bits compared to i16).
1258     is_i16 = 0;
1259     VP8IteratorStartI4(it);
1260     do {
1261       int best_i4_mode = -1;
1262       score_t best_i4_score = MAX_COST;
1263       const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1264       const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1265 
1266       VP8MakeIntra4Preds(it);
1267       for (mode = 0; mode < NUM_BMODES; ++mode) {
1268         const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1269         const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
1270                             + mode_costs[mode] * lambda_d_i4;
1271         if (score < best_i4_score) {
1272           best_i4_mode = mode;
1273           best_i4_score = score;
1274         }
1275       }
1276       i4_bit_sum += mode_costs[best_i4_mode];
1277       rd->modes_i4[it->i4_] = best_i4_mode;
1278       score_i4 += best_i4_score;
1279       if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
1280         // Intra4 won't be better than Intra16. Bail out and pick Intra16.
1281         is_i16 = 1;
1282         break;
1283       } else {  // reconstruct partial block inside yuv_out2_ buffer
1284         uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
1285         nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1286                                 src, tmp_dst, best_i4_mode) << it->i4_;
1287       }
1288     } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
1289   }
1290 
1291   // Final reconstruction, depending on which mode is selected.
1292   if (!is_i16) {
1293     VP8SetIntra4Mode(it, rd->modes_i4);
1294     SwapOut(it);
1295     best_score = score_i4;
1296   } else {
1297     nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
1298   }
1299 
1300   // ... and UV!
1301   if (refine_uv_mode) {
1302     int best_mode = -1;
1303     score_t best_uv_score = MAX_COST;
1304     const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1305     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1306       const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
1307       const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
1308                           + VP8FixedCostsUV[mode] * lambda_d_uv;
1309       if (score < best_uv_score) {
1310         best_mode = mode;
1311         best_uv_score = score;
1312       }
1313     }
1314     VP8SetIntraUVMode(it, best_mode);
1315   }
1316   nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1317 
1318   rd->nz = nz;
1319   rd->score = best_score;
1320 }
1321 
1322 //------------------------------------------------------------------------------
1323 // Entry point
1324 
VP8Decimate(VP8EncIterator * const it,VP8ModeScore * const rd,VP8RDLevel rd_opt)1325 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1326                 VP8RDLevel rd_opt) {
1327   int is_skipped;
1328   const int method = it->enc_->method_;
1329 
1330   InitScore(rd);
1331 
1332   // We can perform predictions for Luma16x16 and Chroma8x8 already.
1333   // Luma4x4 predictions needs to be done as-we-go.
1334   VP8MakeLuma16Preds(it);
1335   VP8MakeChroma8Preds(it);
1336 
1337   if (rd_opt > RD_OPT_NONE) {
1338     it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1339     PickBestIntra16(it, rd);
1340     if (method >= 2) {
1341       PickBestIntra4(it, rd);
1342     }
1343     PickBestUV(it, rd);
1344     if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
1345       it->do_trellis_ = 1;
1346       SimpleQuantize(it, rd);
1347     }
1348   } else {
1349     // At this point we have heuristically decided intra16 / intra4.
1350     // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
1351     // For method <= 1, we don't re-examine the decision but just go ahead with
1352     // quantization/reconstruction.
1353     RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
1354   }
1355   is_skipped = (rd->nz == 0);
1356   VP8SetSkip(it, is_skipped);
1357   return is_skipped;
1358 }
1359