1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 //   Quantization
11 //
12 // Author: Skal (pascal.massimino@gmail.com)
13 
14 #include <assert.h>
15 #include <math.h>
16 #include <stdlib.h>  // for abs()
17 
18 #include "./vp8enci.h"
19 #include "./cost.h"
20 
21 #define DO_TRELLIS_I4  1
22 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
23 #define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
24 #define USE_TDISTO 1
25 
26 #define MID_ALPHA 64      // neutral value for susceptibility
27 #define MIN_ALPHA 30      // lowest usable value for susceptibility
28 #define MAX_ALPHA 100     // higher meaningful value for susceptibility
29 
30 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
31                           // power-law modulation. Must be strictly less than 1.
32 
33 #define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision
34 
35 // number of non-zero coeffs below which we consider the block very flat
36 // (and apply a penalty to complex predictions)
37 #define FLATNESS_LIMIT_I16 10      // I16 mode
38 #define FLATNESS_LIMIT_I4  3       // I4 mode
39 #define FLATNESS_LIMIT_UV  2       // UV mode
40 #define FLATNESS_PENALTY   140     // roughly ~1bit per block
41 
42 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
43 
44 // #define DEBUG_BLOCK
45 
46 //------------------------------------------------------------------------------
47 
48 #if defined(DEBUG_BLOCK)
49 
50 #include <stdio.h>
51 #include <stdlib.h>
52 
PrintBlockInfo(const VP8EncIterator * const it,const VP8ModeScore * const rd)53 static void PrintBlockInfo(const VP8EncIterator* const it,
54                            const VP8ModeScore* const rd) {
55   int i, j;
56   const int is_i16 = (it->mb_->type_ == 1);
57   printf("SOURCE / OUTPUT / ABS DELTA\n");
58   for (j = 0; j < 24; ++j) {
59     if (j == 16) printf("\n");   // newline before the U/V block
60     for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
61     printf("     ");
62     for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
63     printf("     ");
64     for (i = 0; i < 16; ++i) {
65       printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
66     }
67     printf("\n");
68   }
69   printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
70     (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
71     (int)rd->score);
72   if (is_i16) {
73     printf("Mode: %d\n", rd->mode_i16);
74     printf("y_dc_levels:");
75     for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
76     printf("\n");
77   } else {
78     printf("Modes[16]: ");
79     for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
80     printf("\n");
81   }
82   printf("y_ac_levels:\n");
83   for (j = 0; j < 16; ++j) {
84     for (i = is_i16 ? 1 : 0; i < 16; ++i) {
85       printf("%4d ", rd->y_ac_levels[j][i]);
86     }
87     printf("\n");
88   }
89   printf("\n");
90   printf("uv_levels (mode=%d):\n", rd->mode_uv);
91   for (j = 0; j < 8; ++j) {
92     for (i = 0; i < 16; ++i) {
93       printf("%4d ", rd->uv_levels[j][i]);
94     }
95     printf("\n");
96   }
97 }
98 
99 #endif   // DEBUG_BLOCK
100 
101 //------------------------------------------------------------------------------
102 
clip(int v,int m,int M)103 static WEBP_INLINE int clip(int v, int m, int M) {
104   return v < m ? m : v > M ? M : v;
105 }
106 
107 static const uint8_t kZigzag[16] = {
108   0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
109 };
110 
111 static const uint8_t kDcTable[128] = {
112   4,     5,   6,   7,   8,   9,  10,  10,
113   11,   12,  13,  14,  15,  16,  17,  17,
114   18,   19,  20,  20,  21,  21,  22,  22,
115   23,   23,  24,  25,  25,  26,  27,  28,
116   29,   30,  31,  32,  33,  34,  35,  36,
117   37,   37,  38,  39,  40,  41,  42,  43,
118   44,   45,  46,  46,  47,  48,  49,  50,
119   51,   52,  53,  54,  55,  56,  57,  58,
120   59,   60,  61,  62,  63,  64,  65,  66,
121   67,   68,  69,  70,  71,  72,  73,  74,
122   75,   76,  76,  77,  78,  79,  80,  81,
123   82,   83,  84,  85,  86,  87,  88,  89,
124   91,   93,  95,  96,  98, 100, 101, 102,
125   104, 106, 108, 110, 112, 114, 116, 118,
126   122, 124, 126, 128, 130, 132, 134, 136,
127   138, 140, 143, 145, 148, 151, 154, 157
128 };
129 
130 static const uint16_t kAcTable[128] = {
131   4,     5,   6,   7,   8,   9,  10,  11,
132   12,   13,  14,  15,  16,  17,  18,  19,
133   20,   21,  22,  23,  24,  25,  26,  27,
134   28,   29,  30,  31,  32,  33,  34,  35,
135   36,   37,  38,  39,  40,  41,  42,  43,
136   44,   45,  46,  47,  48,  49,  50,  51,
137   52,   53,  54,  55,  56,  57,  58,  60,
138   62,   64,  66,  68,  70,  72,  74,  76,
139   78,   80,  82,  84,  86,  88,  90,  92,
140   94,   96,  98, 100, 102, 104, 106, 108,
141   110, 112, 114, 116, 119, 122, 125, 128,
142   131, 134, 137, 140, 143, 146, 149, 152,
143   155, 158, 161, 164, 167, 170, 173, 177,
144   181, 185, 189, 193, 197, 201, 205, 209,
145   213, 217, 221, 225, 229, 234, 239, 245,
146   249, 254, 259, 264, 269, 274, 279, 284
147 };
148 
149 static const uint16_t kAcTable2[128] = {
150   8,     8,   9,  10,  12,  13,  15,  17,
151   18,   20,  21,  23,  24,  26,  27,  29,
152   31,   32,  34,  35,  37,  38,  40,  41,
153   43,   44,  46,  48,  49,  51,  52,  54,
154   55,   57,  58,  60,  62,  63,  65,  66,
155   68,   69,  71,  72,  74,  75,  77,  79,
156   80,   82,  83,  85,  86,  88,  89,  93,
157   96,   99, 102, 105, 108, 111, 114, 117,
158   120, 124, 127, 130, 133, 136, 139, 142,
159   145, 148, 151, 155, 158, 161, 164, 167,
160   170, 173, 176, 179, 184, 189, 193, 198,
161   203, 207, 212, 217, 221, 226, 230, 235,
162   240, 244, 249, 254, 258, 263, 268, 274,
163   280, 286, 292, 299, 305, 311, 317, 323,
164   330, 336, 342, 348, 354, 362, 370, 379,
165   385, 393, 401, 409, 416, 424, 432, 440
166 };
167 
168 static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
169   { 96, 110 }, { 96, 108 }, { 110, 115 }
170 };
171 
172 // Sharpening by (slightly) raising the hi-frequency coeffs.
173 // Hack-ish but helpful for mid-bitrate range. Use with care.
174 #define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
175 static const uint8_t kFreqSharpening[16] = {
176   0,  30, 60, 90,
177   30, 60, 90, 90,
178   60, 90, 90, 90,
179   90, 90, 90, 90
180 };
181 
182 //------------------------------------------------------------------------------
183 // Initialize quantization parameters in VP8Matrix
184 
185 // Returns the average quantizer
ExpandMatrix(VP8Matrix * const m,int type)186 static int ExpandMatrix(VP8Matrix* const m, int type) {
187   int i, sum;
188   for (i = 0; i < 2; ++i) {
189     const int is_ac_coeff = (i > 0);
190     const int bias = kBiasMatrices[type][is_ac_coeff];
191     m->iq_[i] = (1 << QFIX) / m->q_[i];
192     m->bias_[i] = BIAS(bias);
193     // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
194     //   * zero if coeff <= zthresh
195     //   * non-zero if coeff > zthresh
196     m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
197   }
198   for (i = 2; i < 16; ++i) {
199     m->q_[i] = m->q_[1];
200     m->iq_[i] = m->iq_[1];
201     m->bias_[i] = m->bias_[1];
202     m->zthresh_[i] = m->zthresh_[1];
203   }
204   for (sum = 0, i = 0; i < 16; ++i) {
205     if (type == 0) {  // we only use sharpening for AC luma coeffs
206       m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
207     } else {
208       m->sharpen_[i] = 0;
209     }
210     sum += m->q_[i];
211   }
212   return (sum + 8) >> 4;
213 }
214 
SetupMatrices(VP8Encoder * enc)215 static void SetupMatrices(VP8Encoder* enc) {
216   int i;
217   const int tlambda_scale =
218     (enc->method_ >= 4) ? enc->config_->sns_strength
219                         : 0;
220   const int num_segments = enc->segment_hdr_.num_segments_;
221   for (i = 0; i < num_segments; ++i) {
222     VP8SegmentInfo* const m = &enc->dqm_[i];
223     const int q = m->quant_;
224     int q4, q16, quv;
225     m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
226     m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];
227 
228     m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
229     m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
230 
231     m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
232     m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
233 
234     q4  = ExpandMatrix(&m->y1_, 0);
235     q16 = ExpandMatrix(&m->y2_, 1);
236     quv = ExpandMatrix(&m->uv_, 2);
237 
238     m->lambda_i4_          = (3 * q4 * q4) >> 7;
239     m->lambda_i16_         = (3 * q16 * q16);
240     m->lambda_uv_          = (3 * quv * quv) >> 6;
241     m->lambda_mode_        = (1 * q4 * q4) >> 7;
242     m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
243     m->lambda_trellis_i16_ = (q16 * q16) >> 2;
244     m->lambda_trellis_uv_  = (quv *quv) << 1;
245     m->tlambda_            = (tlambda_scale * q4) >> 5;
246 
247     m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
248     m->max_edge_  = 0;
249   }
250 }
251 
252 //------------------------------------------------------------------------------
253 // Initialize filtering parameters
254 
255 // Very small filter-strength values have close to no visual effect. So we can
256 // save a little decoding-CPU by turning filtering off for these.
257 #define FSTRENGTH_CUTOFF 2
258 
SetupFilterStrength(VP8Encoder * const enc)259 static void SetupFilterStrength(VP8Encoder* const enc) {
260   int i;
261   // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
262   const int level0 = 5 * enc->config_->filter_strength;
263   for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
264     VP8SegmentInfo* const m = &enc->dqm_[i];
265     // We focus on the quantization of AC coeffs.
266     const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
267     const int base_strength =
268         VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
269     // Segments with lower complexity ('beta') will be less filtered.
270     const int f = base_strength * level0 / (256 + m->beta_);
271     m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
272   }
273   // We record the initial strength (mainly for the case of 1-segment only).
274   enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
275   enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
276   enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
277 }
278 
279 //------------------------------------------------------------------------------
280 
281 // Note: if you change the values below, remember that the max range
282 // allowed by the syntax for DQ_UV is [-16,16].
283 #define MAX_DQ_UV (6)
284 #define MIN_DQ_UV (-4)
285 
286 // We want to emulate jpeg-like behaviour where the expected "good" quality
287 // is around q=75. Internally, our "good" middle is around c=50. So we
288 // map accordingly using linear piece-wise function
QualityToCompression(double c)289 static double QualityToCompression(double c) {
290   const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
291   // The file size roughly scales as pow(quantizer, 3.). Actually, the
292   // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
293   // in the mid-quant range. So we scale the compressibility inversely to
294   // this power-law: quant ~= compression ^ 1/3. This law holds well for
295   // low quant. Finer modeling for high-quant would make use of kAcTable[]
296   // more explicitly.
297   const double v = pow(linear_c, 1 / 3.);
298   return v;
299 }
300 
QualityToJPEGCompression(double c,double alpha)301 static double QualityToJPEGCompression(double c, double alpha) {
302   // We map the complexity 'alpha' and quality setting 'c' to a compression
303   // exponent empirically matched to the compression curve of libjpeg6b.
304   // On average, the WebP output size will be roughly similar to that of a
305   // JPEG file compressed with same quality factor.
306   const double amin = 0.30;
307   const double amax = 0.85;
308   const double exp_min = 0.4;
309   const double exp_max = 0.9;
310   const double slope = (exp_min - exp_max) / (amax - amin);
311   // Linearly interpolate 'expn' from exp_min to exp_max
312   // in the [amin, amax] range.
313   const double expn = (alpha > amax) ? exp_min
314                     : (alpha < amin) ? exp_max
315                     : exp_max + slope * (alpha - amin);
316   const double v = pow(c, expn);
317   return v;
318 }
319 
SegmentsAreEquivalent(const VP8SegmentInfo * const S1,const VP8SegmentInfo * const S2)320 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
321                                  const VP8SegmentInfo* const S2) {
322   return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
323 }
324 
SimplifySegments(VP8Encoder * const enc)325 static void SimplifySegments(VP8Encoder* const enc) {
326   int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
327   const int num_segments = enc->segment_hdr_.num_segments_;
328   int num_final_segments = 1;
329   int s1, s2;
330   for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
331     const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
332     int found = 0;
333     // check if we already have similar segment
334     for (s2 = 0; s2 < num_final_segments; ++s2) {
335       const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
336       if (SegmentsAreEquivalent(S1, S2)) {
337         found = 1;
338         break;
339       }
340     }
341     map[s1] = s2;
342     if (!found) {
343       if (num_final_segments != s1) {
344         enc->dqm_[num_final_segments] = enc->dqm_[s1];
345       }
346       ++num_final_segments;
347     }
348   }
349   if (num_final_segments < num_segments) {  // Remap
350     int i = enc->mb_w_ * enc->mb_h_;
351     while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
352     enc->segment_hdr_.num_segments_ = num_final_segments;
353     // Replicate the trailing segment infos (it's mostly cosmetics)
354     for (i = num_final_segments; i < num_segments; ++i) {
355       enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
356     }
357   }
358 }
359 
VP8SetSegmentParams(VP8Encoder * const enc,float quality)360 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
361   int i;
362   int dq_uv_ac, dq_uv_dc;
363   const int num_segments = enc->segment_hdr_.num_segments_;
364   const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
365   const double Q = quality / 100.;
366   const double c_base = enc->config_->emulate_jpeg_size ?
367       QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
368       QualityToCompression(Q);
369   for (i = 0; i < num_segments; ++i) {
370     // We modulate the base coefficient to accommodate for the quantization
371     // susceptibility and allow denser segments to be quantized more.
372     const double expn = 1. - amp * enc->dqm_[i].alpha_;
373     const double c = pow(c_base, expn);
374     const int q = (int)(127. * (1. - c));
375     assert(expn > 0.);
376     enc->dqm_[i].quant_ = clip(q, 0, 127);
377   }
378 
379   // purely indicative in the bitstream (except for the 1-segment case)
380   enc->base_quant_ = enc->dqm_[0].quant_;
381 
382   // fill-in values for the unused segments (required by the syntax)
383   for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
384     enc->dqm_[i].quant_ = enc->base_quant_;
385   }
386 
387   // uv_alpha_ is normally spread around ~60. The useful range is
388   // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
389   // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
390   dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
391                                           / (MAX_ALPHA - MIN_ALPHA);
392   // we rescale by the user-defined strength of adaptation
393   dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
394   // and make it safe.
395   dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
396   // We also boost the dc-uv-quant a little, based on sns-strength, since
397   // U/V channels are quite more reactive to high quants (flat DC-blocks
398   // tend to appear, and are unpleasant).
399   dq_uv_dc = -4 * enc->config_->sns_strength / 100;
400   dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed
401 
402   enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
403   enc->dq_y2_dc_ = 0;
404   enc->dq_y2_ac_ = 0;
405   enc->dq_uv_dc_ = dq_uv_dc;
406   enc->dq_uv_ac_ = dq_uv_ac;
407 
408   SetupFilterStrength(enc);   // initialize segments' filtering, eventually
409 
410   if (num_segments > 1) SimplifySegments(enc);
411 
412   SetupMatrices(enc);         // finalize quantization matrices
413 }
414 
415 //------------------------------------------------------------------------------
416 // Form the predictions in cache
417 
418 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
419 const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
420 const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
421 
422 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
423 const int VP8I4ModeOffsets[NUM_BMODES] = {
424   I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
425 };
426 
VP8MakeLuma16Preds(const VP8EncIterator * const it)427 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
428   const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
429   const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
430   VP8EncPredLuma16(it->yuv_p_, left, top);
431 }
432 
VP8MakeChroma8Preds(const VP8EncIterator * const it)433 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
434   const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
435   const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
436   VP8EncPredChroma8(it->yuv_p_, left, top);
437 }
438 
VP8MakeIntra4Preds(const VP8EncIterator * const it)439 void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
440   VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
441 }
442 
443 //------------------------------------------------------------------------------
444 // Quantize
445 
446 // Layout:
447 // +----+
448 // |YYYY| 0
449 // |YYYY| 4
450 // |YYYY| 8
451 // |YYYY| 12
452 // +----+
453 // |UUVV| 16
454 // |UUVV| 20
455 // +----+
456 
457 const int VP8Scan[16] = {  // Luma
458   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
459   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
460   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
461   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
462 };
463 
464 static const int VP8ScanUV[4 + 4] = {
465   0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
466   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
467 };
468 
469 //------------------------------------------------------------------------------
470 // Distortion measurement
471 
472 static const uint16_t kWeightY[16] = {
473   38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
474 };
475 
476 static const uint16_t kWeightTrellis[16] = {
477 #if USE_TDISTO == 0
478   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
479 #else
480   30, 27, 19, 11,
481   27, 24, 17, 10,
482   19, 17, 12,  8,
483   11, 10,  8,  6
484 #endif
485 };
486 
487 // Init/Copy the common fields in score.
InitScore(VP8ModeScore * const rd)488 static void InitScore(VP8ModeScore* const rd) {
489   rd->D  = 0;
490   rd->SD = 0;
491   rd->R  = 0;
492   rd->H  = 0;
493   rd->nz = 0;
494   rd->score = MAX_COST;
495 }
496 
CopyScore(VP8ModeScore * const dst,const VP8ModeScore * const src)497 static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
498   dst->D  = src->D;
499   dst->SD = src->SD;
500   dst->R  = src->R;
501   dst->H  = src->H;
502   dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
503   dst->score = src->score;
504 }
505 
AddScore(VP8ModeScore * const dst,const VP8ModeScore * const src)506 static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
507   dst->D  += src->D;
508   dst->SD += src->SD;
509   dst->R  += src->R;
510   dst->H  += src->H;
511   dst->nz |= src->nz;     // here, new nz bits are accumulated.
512   dst->score += src->score;
513 }
514 
515 //------------------------------------------------------------------------------
516 // Performs trellis-optimized quantization.
517 
518 // Trellis node
519 typedef struct {
520   int8_t prev;            // best previous node
521   int8_t sign;            // sign of coeff_i
522   int16_t level;          // level
523 } Node;
524 
525 // Score state
526 typedef struct {
527   score_t score;          // partial RD score
528   const uint16_t* costs;  // shortcut to cost tables
529 } ScoreState;
530 
531 // If a coefficient was quantized to a value Q (using a neutral bias),
532 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
533 // We don't test negative values though.
534 #define MIN_DELTA 0   // how much lower level to try
535 #define MAX_DELTA 1   // how much higher
536 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
537 #define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
538 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
539 
SetRDScore(int lambda,VP8ModeScore * const rd)540 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
541   // TODO: incorporate the "* 256" in the tables?
542   rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
543 }
544 
RDScoreTrellis(int lambda,score_t rate,score_t distortion)545 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
546                                           score_t distortion) {
547   return rate * lambda + 256 * distortion;
548 }
549 
TrellisQuantizeBlock(const VP8Encoder * const enc,int16_t in[16],int16_t out[16],int ctx0,int coeff_type,const VP8Matrix * const mtx,int lambda)550 static int TrellisQuantizeBlock(const VP8Encoder* const enc,
551                                 int16_t in[16], int16_t out[16],
552                                 int ctx0, int coeff_type,
553                                 const VP8Matrix* const mtx,
554                                 int lambda) {
555   const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
556   const CostArray* const costs = enc->proba_.level_cost_[coeff_type];
557   const int first = (coeff_type == 0) ? 1 : 0;
558   Node nodes[16][NUM_NODES];
559   ScoreState score_states[2][NUM_NODES];
560   ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
561   ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
562   int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
563   score_t best_score;
564   int n, m, p, last;
565 
566   {
567     score_t cost;
568     const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
569     const int last_proba = probas[VP8EncBands[first]][ctx0][0];
570 
571     // compute the position of the last interesting coefficient
572     last = first - 1;
573     for (n = 15; n >= first; --n) {
574       const int j = kZigzag[n];
575       const int err = in[j] * in[j];
576       if (err > thresh) {
577         last = n;
578         break;
579       }
580     }
581     // we don't need to go inspect up to n = 16 coeffs. We can just go up
582     // to last + 1 (inclusive) without losing much.
583     if (last < 15) ++last;
584 
585     // compute 'skip' score. This is the max score one can do.
586     cost = VP8BitCost(0, last_proba);
587     best_score = RDScoreTrellis(lambda, cost, 0);
588 
589     // initialize source node.
590     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
591       const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
592       ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
593       ss_cur[m].costs = costs[VP8EncBands[first]][ctx0];
594     }
595   }
596 
597   // traverse trellis.
598   for (n = first; n <= last; ++n) {
599     const int j = kZigzag[n];
600     const uint32_t Q  = mtx->q_[j];
601     const uint32_t iQ = mtx->iq_[j];
602     const uint32_t B = BIAS(0x00);     // neutral bias
603     // note: it's important to take sign of the _original_ coeff,
604     // so we don't have to consider level < 0 afterward.
605     const int sign = (in[j] < 0);
606     const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
607     int level0 = QUANTDIV(coeff0, iQ, B);
608     if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
609 
610     {   // Swap current and previous score states
611       ScoreState* const tmp = ss_cur;
612       ss_cur = ss_prev;
613       ss_prev = tmp;
614     }
615 
616     // test all alternate level values around level0.
617     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
618       Node* const cur = &NODE(n, m);
619       int level = level0 + m;
620       const int ctx = (level > 2) ? 2 : level;
621       const int band = VP8EncBands[n + 1];
622       score_t base_score, last_pos_score;
623       score_t best_cur_score = MAX_COST;
624       int best_prev = 0;   // default, in case
625 
626       ss_cur[m].score = MAX_COST;
627       ss_cur[m].costs = costs[band][ctx];
628       if (level > MAX_LEVEL || level < 0) {   // node is dead?
629         continue;
630       }
631 
632       // Compute extra rate cost if last coeff's position is < 15
633       {
634         const score_t last_pos_cost =
635             (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
636         last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
637       }
638 
639       {
640         // Compute delta_error = how much coding this level will
641         // subtract to max_error as distortion.
642         // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
643         const int new_error = coeff0 - level * Q;
644         const int delta_error =
645             kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
646         base_score = RDScoreTrellis(lambda, 0, delta_error);
647       }
648 
649       // Inspect all possible non-dead predecessors. Retain only the best one.
650       for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
651         // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
652         // eliminated since their score can't be better than the current best.
653         const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
654         // Examine node assuming it's a non-terminal one.
655         const score_t score =
656             base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
657         if (score < best_cur_score) {
658           best_cur_score = score;
659           best_prev = p;
660         }
661       }
662       // Store best finding in current node.
663       cur->sign = sign;
664       cur->level = level;
665       cur->prev = best_prev;
666       ss_cur[m].score = best_cur_score;
667 
668       // Now, record best terminal node (and thus best entry in the graph).
669       if (level != 0) {
670         const score_t score = best_cur_score + last_pos_score;
671         if (score < best_score) {
672           best_score = score;
673           best_path[0] = n;                     // best eob position
674           best_path[1] = m;                     // best node index
675           best_path[2] = best_prev;             // best predecessor
676         }
677       }
678     }
679   }
680 
681   // Fresh start
682   memset(in + first, 0, (16 - first) * sizeof(*in));
683   memset(out + first, 0, (16 - first) * sizeof(*out));
684   if (best_path[0] == -1) {
685     return 0;   // skip!
686   }
687 
688   {
689     // Unwind the best path.
690     // Note: best-prev on terminal node is not necessarily equal to the
691     // best_prev for non-terminal. So we patch best_path[2] in.
692     int nz = 0;
693     int best_node = best_path[1];
694     n = best_path[0];
695     NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
696 
697     for (; n >= first; --n) {
698       const Node* const node = &NODE(n, best_node);
699       const int j = kZigzag[n];
700       out[n] = node->sign ? -node->level : node->level;
701       nz |= node->level;
702       in[j] = out[n] * mtx->q_[j];
703       best_node = node->prev;
704     }
705     return (nz != 0);
706   }
707 }
708 
709 #undef NODE
710 
711 //------------------------------------------------------------------------------
712 // Performs: difference, transform, quantize, back-transform, add
713 // all at once. Output is the reconstructed block in *yuv_out, and the
714 // quantized levels in *levels.
715 
ReconstructIntra16(VP8EncIterator * const it,VP8ModeScore * const rd,uint8_t * const yuv_out,int mode)716 static int ReconstructIntra16(VP8EncIterator* const it,
717                               VP8ModeScore* const rd,
718                               uint8_t* const yuv_out,
719                               int mode) {
720   const VP8Encoder* const enc = it->enc_;
721   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
722   const uint8_t* const src = it->yuv_in_ + Y_OFF;
723   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
724   int nz = 0;
725   int n;
726   int16_t tmp[16][16], dc_tmp[16];
727 
728   for (n = 0; n < 16; ++n) {
729     VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
730   }
731   VP8FTransformWHT(tmp[0], dc_tmp);
732   nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
733 
734   if (DO_TRELLIS_I16 && it->do_trellis_) {
735     int x, y;
736     VP8IteratorNzToBytes(it);
737     for (y = 0, n = 0; y < 4; ++y) {
738       for (x = 0; x < 4; ++x, ++n) {
739         const int ctx = it->top_nz_[x] + it->left_nz_[y];
740         const int non_zero =
741             TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
742                                  &dqm->y1_, dqm->lambda_trellis_i16_);
743         it->top_nz_[x] = it->left_nz_[y] = non_zero;
744         rd->y_ac_levels[n][0] = 0;
745         nz |= non_zero << n;
746       }
747     }
748   } else {
749     for (n = 0; n < 16; ++n) {
750       // Zero-out the first coeff, so that: a) nz is correct below, and
751       // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
752       tmp[n][0] = 0;
753       nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
754       assert(rd->y_ac_levels[n][0] == 0);
755     }
756   }
757 
758   // Transform back
759   VP8TransformWHT(dc_tmp, tmp[0]);
760   for (n = 0; n < 16; n += 2) {
761     VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
762   }
763 
764   return nz;
765 }
766 
ReconstructIntra4(VP8EncIterator * const it,int16_t levels[16],const uint8_t * const src,uint8_t * const yuv_out,int mode)767 static int ReconstructIntra4(VP8EncIterator* const it,
768                              int16_t levels[16],
769                              const uint8_t* const src,
770                              uint8_t* const yuv_out,
771                              int mode) {
772   const VP8Encoder* const enc = it->enc_;
773   const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
774   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
775   int nz = 0;
776   int16_t tmp[16];
777 
778   VP8FTransform(src, ref, tmp);
779   if (DO_TRELLIS_I4 && it->do_trellis_) {
780     const int x = it->i4_ & 3, y = it->i4_ >> 2;
781     const int ctx = it->top_nz_[x] + it->left_nz_[y];
782     nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
783                               dqm->lambda_trellis_i4_);
784   } else {
785     nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
786   }
787   VP8ITransform(ref, tmp, yuv_out, 0);
788   return nz;
789 }
790 
ReconstructUV(VP8EncIterator * const it,VP8ModeScore * const rd,uint8_t * const yuv_out,int mode)791 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
792                          uint8_t* const yuv_out, int mode) {
793   const VP8Encoder* const enc = it->enc_;
794   const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
795   const uint8_t* const src = it->yuv_in_ + U_OFF;
796   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
797   int nz = 0;
798   int n;
799   int16_t tmp[8][16];
800 
801   for (n = 0; n < 8; ++n) {
802     VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
803   }
804   if (DO_TRELLIS_UV && it->do_trellis_) {
805     int ch, x, y;
806     for (ch = 0, n = 0; ch <= 2; ch += 2) {
807       for (y = 0; y < 2; ++y) {
808         for (x = 0; x < 2; ++x, ++n) {
809           const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
810           const int non_zero =
811               TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
812                                    &dqm->uv_, dqm->lambda_trellis_uv_);
813           it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
814           nz |= non_zero << n;
815         }
816       }
817     }
818   } else {
819     for (n = 0; n < 8; ++n) {
820       nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
821     }
822   }
823 
824   for (n = 0; n < 8; n += 2) {
825     VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
826   }
827   return (nz << 16);
828 }
829 
830 //------------------------------------------------------------------------------
831 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
832 // Pick the mode is lower RD-cost = Rate + lambda * Distortion.
833 
StoreMaxDelta(VP8SegmentInfo * const dqm,const int16_t DCs[16])834 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
835   // We look at the first three AC coefficients to determine what is the average
836   // delta between each sub-4x4 block.
837   const int v0 = abs(DCs[1]);
838   const int v1 = abs(DCs[4]);
839   const int v2 = abs(DCs[5]);
840   int max_v = (v0 > v1) ? v1 : v0;
841   max_v = (v2 > max_v) ? v2 : max_v;
842   if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
843 }
844 
SwapPtr(uint8_t ** a,uint8_t ** b)845 static void SwapPtr(uint8_t** a, uint8_t** b) {
846   uint8_t* const tmp = *a;
847   *a = *b;
848   *b = tmp;
849 }
850 
SwapOut(VP8EncIterator * const it)851 static void SwapOut(VP8EncIterator* const it) {
852   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
853 }
854 
IsFlat(const int16_t * levels,int num_blocks,score_t thresh)855 static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
856   score_t score = 0;
857   while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
858     int i;
859     for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
860       score += (levels[i] != 0);
861       if (score > thresh) return 0;
862     }
863     levels += 16;
864   }
865   return 1;
866 }
867 
PickBestIntra16(VP8EncIterator * const it,VP8ModeScore * const rd)868 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
869   const int kNumBlocks = 16;
870   VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
871   const int lambda = dqm->lambda_i16_;
872   const int tlambda = dqm->tlambda_;
873   const uint8_t* const src = it->yuv_in_ + Y_OFF;
874   VP8ModeScore rd16;
875   int mode;
876 
877   rd->mode_i16 = -1;
878   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
879     uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;  // scratch buffer
880     int nz;
881 
882     // Reconstruct
883     nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);
884 
885     // Measure RD-score
886     rd16.D = VP8SSE16x16(src, tmp_dst);
887     rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
888             : 0;
889     rd16.H = VP8FixedCostsI16[mode];
890     rd16.R = VP8GetCostLuma16(it, &rd16);
891     if (mode > 0 &&
892         IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
893       // penalty to avoid flat area to be mispredicted by complex mode
894       rd16.R += FLATNESS_PENALTY * kNumBlocks;
895     }
896 
897     // Since we always examine Intra16 first, we can overwrite *rd directly.
898     SetRDScore(lambda, &rd16);
899     if (mode == 0 || rd16.score < rd->score) {
900       CopyScore(rd, &rd16);
901       rd->mode_i16 = mode;
902       rd->nz = nz;
903       memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
904       memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
905       SwapOut(it);
906     }
907   }
908   SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
909   VP8SetIntra16Mode(it, rd->mode_i16);
910 
911   // we have a blocky macroblock (only DCs are non-zero) with fairly high
912   // distortion, record max delta so we can later adjust the minimal filtering
913   // strength needed to smooth these blocks out.
914   if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
915     StoreMaxDelta(dqm, rd->y_dc_levels);
916   }
917 }
918 
919 //------------------------------------------------------------------------------
920 
921 // return the cost array corresponding to the surrounding prediction modes.
GetCostModeI4(VP8EncIterator * const it,const uint8_t modes[16])922 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
923                                      const uint8_t modes[16]) {
924   const int preds_w = it->enc_->preds_w_;
925   const int x = (it->i4_ & 3), y = it->i4_ >> 2;
926   const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
927   const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
928   return VP8FixedCostsI4[top][left];
929 }
930 
PickBestIntra4(VP8EncIterator * const it,VP8ModeScore * const rd)931 static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
932   const VP8Encoder* const enc = it->enc_;
933   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
934   const int lambda = dqm->lambda_i4_;
935   const int tlambda = dqm->tlambda_;
936   const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
937   uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
938   int total_header_bits = 0;
939   VP8ModeScore rd_best;
940 
941   if (enc->max_i4_header_bits_ == 0) {
942     return 0;
943   }
944 
945   InitScore(&rd_best);
946   rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
947   SetRDScore(dqm->lambda_mode_, &rd_best);
948   VP8IteratorStartI4(it);
949   do {
950     const int kNumBlocks = 1;
951     VP8ModeScore rd_i4;
952     int mode;
953     int best_mode = -1;
954     const uint8_t* const src = src0 + VP8Scan[it->i4_];
955     const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
956     uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
957     uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.
958 
959     InitScore(&rd_i4);
960     VP8MakeIntra4Preds(it);
961     for (mode = 0; mode < NUM_BMODES; ++mode) {
962       VP8ModeScore rd_tmp;
963       int16_t tmp_levels[16];
964 
965       // Reconstruct
966       rd_tmp.nz =
967           ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
968 
969       // Compute RD-score
970       rd_tmp.D = VP8SSE4x4(src, tmp_dst);
971       rd_tmp.SD =
972           tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
973                   : 0;
974       rd_tmp.H = mode_costs[mode];
975       rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
976       if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
977         rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
978       }
979 
980       SetRDScore(lambda, &rd_tmp);
981       if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
982         CopyScore(&rd_i4, &rd_tmp);
983         best_mode = mode;
984         SwapPtr(&tmp_dst, &best_block);
985         memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
986       }
987     }
988     SetRDScore(dqm->lambda_mode_, &rd_i4);
989     AddScore(&rd_best, &rd_i4);
990     if (rd_best.score >= rd->score) {
991       return 0;
992     }
993     total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
994     if (total_header_bits > enc->max_i4_header_bits_) {
995       return 0;
996     }
997     // Copy selected samples if not in the right place already.
998     if (best_block != best_blocks + VP8Scan[it->i4_]) {
999       VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
1000     }
1001     rd->modes_i4[it->i4_] = best_mode;
1002     it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
1003   } while (VP8IteratorRotateI4(it, best_blocks));
1004 
1005   // finalize state
1006   CopyScore(rd, &rd_best);
1007   VP8SetIntra4Mode(it, rd->modes_i4);
1008   SwapOut(it);
1009   memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
1010   return 1;   // select intra4x4 over intra16x16
1011 }
1012 
1013 //------------------------------------------------------------------------------
1014 
PickBestUV(VP8EncIterator * const it,VP8ModeScore * const rd)1015 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
1016   const int kNumBlocks = 8;
1017   const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1018   const int lambda = dqm->lambda_uv_;
1019   const uint8_t* const src = it->yuv_in_ + U_OFF;
1020   uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF;  // scratch buffer
1021   uint8_t* const dst0 = it->yuv_out_ + U_OFF;
1022   VP8ModeScore rd_best;
1023   int mode;
1024 
1025   rd->mode_uv = -1;
1026   InitScore(&rd_best);
1027   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1028     VP8ModeScore rd_uv;
1029 
1030     // Reconstruct
1031     rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
1032 
1033     // Compute RD-score
1034     rd_uv.D  = VP8SSE16x8(src, tmp_dst);
1035     rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
1036     rd_uv.H  = VP8FixedCostsUV[mode];
1037     rd_uv.R  = VP8GetCostUV(it, &rd_uv);
1038     if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
1039       rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
1040     }
1041 
1042     SetRDScore(lambda, &rd_uv);
1043     if (mode == 0 || rd_uv.score < rd_best.score) {
1044       CopyScore(&rd_best, &rd_uv);
1045       rd->mode_uv = mode;
1046       memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
1047       memcpy(dst0, tmp_dst, UV_SIZE);   //  TODO: SwapUVOut() ?
1048     }
1049   }
1050   VP8SetIntraUVMode(it, rd->mode_uv);
1051   AddScore(rd, &rd_best);
1052 }
1053 
1054 //------------------------------------------------------------------------------
1055 // Final reconstruction and quantization.
1056 
SimpleQuantize(VP8EncIterator * const it,VP8ModeScore * const rd)1057 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
1058   const VP8Encoder* const enc = it->enc_;
1059   const int is_i16 = (it->mb_->type_ == 1);
1060   int nz = 0;
1061 
1062   if (is_i16) {
1063     nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
1064   } else {
1065     VP8IteratorStartI4(it);
1066     do {
1067       const int mode =
1068           it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
1069       const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
1070       uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
1071       VP8MakeIntra4Preds(it);
1072       nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1073                               src, dst, mode) << it->i4_;
1074     } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
1075   }
1076 
1077   nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
1078   rd->nz = nz;
1079 }
1080 
1081 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
DistoRefine(VP8EncIterator * const it,int try_both_i4_i16)1082 static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
1083   const int is_i16 = (it->mb_->type_ == 1);
1084   score_t best_score = MAX_COST;
1085 
1086   if (try_both_i4_i16 || is_i16) {
1087     int mode;
1088     int best_mode = -1;
1089     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1090       const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1091       const uint8_t* const src = it->yuv_in_ + Y_OFF;
1092       const score_t score = VP8SSE16x16(src, ref);
1093       if (score < best_score) {
1094         best_mode = mode;
1095         best_score = score;
1096       }
1097     }
1098     VP8SetIntra16Mode(it, best_mode);
1099   }
1100   if (try_both_i4_i16 || !is_i16) {
1101     uint8_t modes_i4[16];
1102     // We don't evaluate the rate here, but just account for it through a
1103     // constant penalty (i4 mode usually needs more bits compared to i16).
1104     score_t score_i4 = (score_t)I4_PENALTY;
1105 
1106     VP8IteratorStartI4(it);
1107     do {
1108       int mode;
1109       int best_sub_mode = -1;
1110       score_t best_sub_score = MAX_COST;
1111       const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
1112 
1113       // TODO(skal): we don't really need the prediction pixels here,
1114       // but just the distortion against 'src'.
1115       VP8MakeIntra4Preds(it);
1116       for (mode = 0; mode < NUM_BMODES; ++mode) {
1117         const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1118         const score_t score = VP8SSE4x4(src, ref);
1119         if (score < best_sub_score) {
1120           best_sub_mode = mode;
1121           best_sub_score = score;
1122         }
1123       }
1124       modes_i4[it->i4_] = best_sub_mode;
1125       score_i4 += best_sub_score;
1126       if (score_i4 >= best_score) break;
1127     } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
1128     if (score_i4 < best_score) {
1129       VP8SetIntra4Mode(it, modes_i4);
1130     }
1131   }
1132 }
1133 
1134 //------------------------------------------------------------------------------
1135 // Entry point
1136 
VP8Decimate(VP8EncIterator * const it,VP8ModeScore * const rd,VP8RDLevel rd_opt)1137 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1138                 VP8RDLevel rd_opt) {
1139   int is_skipped;
1140   const int method = it->enc_->method_;
1141 
1142   InitScore(rd);
1143 
1144   // We can perform predictions for Luma16x16 and Chroma8x8 already.
1145   // Luma4x4 predictions needs to be done as-we-go.
1146   VP8MakeLuma16Preds(it);
1147   VP8MakeChroma8Preds(it);
1148 
1149   if (rd_opt > RD_OPT_NONE) {
1150     it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1151     PickBestIntra16(it, rd);
1152     if (method >= 2) {
1153       PickBestIntra4(it, rd);
1154     }
1155     PickBestUV(it, rd);
1156     if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
1157       it->do_trellis_ = 1;
1158       SimpleQuantize(it, rd);
1159     }
1160   } else {
1161     // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
1162     // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
1163     DistoRefine(it, (method >= 2));
1164     SimpleQuantize(it, rd);
1165   }
1166   is_skipped = (rd->nz == 0);
1167   VP8SetSkip(it, is_skipped);
1168   return is_skipped;
1169 }
1170 
1171