// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal (pascal.massimino@gmail.com)

#include <assert.h>
#include <math.h>
#include <stdlib.h>  // for abs()

#include "./vp8enci.h"
#include "./cost.h"

#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64      // neutral value for susceptibility
#define MIN_ALPHA 30      // lowest usable value for susceptibility
#define MAX_ALPHA 100     // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                          // power-law modulation. Must be strictly less than 1.

#define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 10      // I16 mode
#define FLATNESS_LIMIT_I4  3       // I4 mode
#define FLATNESS_LIMIT_UV  2       // UV mode
#define FLATNESS_PENALTY   140     // roughly ~1bit per block

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
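// MULT_8B(a, b) computes a * b / 256 with rounding, e.g.
// MULT_8B(64, 128) == (64 * 128 + 128) >> 8 == 32. It is used below to apply
// the 8-bit fixed-point 'tlambda' weight to the texture distortion.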

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 24; ++j) {
    if (j == 16) printf("\n");   // newline before the U/V block
    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif   // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
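// kZigzag[n] gives the raster-order index (within the 4x4 block) of the n-th
// coefficient along the zigzag scan, i.e. coefficients ordered by roughly
// increasing spatial frequency.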

static const uint8_t kDcTable[128] = {
  4,   5,   6,   7,   8,   9,   10,  10,
  11,  12,  13,  14,  15,  16,  17,  17,
  18,  19,  20,  20,  21,  21,  22,  22,
  23,  23,  24,  25,  25,  26,  27,  28,
  29,  30,  31,  32,  33,  34,  35,  36,
  37,  37,  38,  39,  40,  41,  42,  43,
  44,  45,  46,  46,  47,  48,  49,  50,
  51,  52,  53,  54,  55,  56,  57,  58,
  59,  60,  61,  62,  63,  64,  65,  66,
  67,  68,  69,  70,  71,  72,  73,  74,
  75,  76,  76,  77,  78,  79,  80,  81,
  82,  83,  84,  85,  86,  87,  88,  89,
  91,  93,  95,  96,  98,  100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4,   5,   6,   7,   8,   9,   10,  11,
  12,  13,  14,  15,  16,  17,  18,  19,
  20,  21,  22,  23,  24,  25,  26,  27,
  28,  29,  30,  31,  32,  33,  34,  35,
  36,  37,  38,  39,  40,  41,  42,  43,
  44,  45,  46,  47,  48,  49,  50,  51,
  52,  53,  54,  55,  56,  57,  58,  60,
  62,  64,  66,  68,  70,  72,  74,  76,
  78,  80,  82,  84,  86,  88,  90,  92,
  94,  96,  98,  100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8,   8,   9,   10,  12,  13,  15,  17,
  18,  20,  21,  23,  24,  26,  27,  29,
  31,  32,  34,  35,  37,  38,  40,  41,
  43,  44,  46,  48,  49,  51,  52,  54,
  55,  57,  58,  60,  62,  63,  65,  66,
  68,  69,  71,  72,  74,  75,  77,  79,
  80,  82,  83,  85,  86,  88,  89,  93,
  96,  99,  102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11   // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
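    // (Derivation, assuming QUANTDIV(coeff, iQ, B) is the fixed-point division
    // (coeff * iQ + B) >> QFIX as defined in the encoder headers: the result
    // is zero exactly when coeff * iQ + B < (1 << QFIX), i.e. when
    // coeff <= ((1 << QFIX) - 1 - B) / iQ.)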
  }
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {   // we only use sharpening for AC luma coeffs
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen_[i] = 0;
    }
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
      (enc->method_ >= 4) ? enc->config_->sns_strength
                          : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q4, q16, quv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q4  = ExpandMatrix(&m->y1_, 0);
    q16 = ExpandMatrix(&m->y2_, 1);
    quv = ExpandMatrix(&m->uv_, 2);

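    // The lambda values below scale roughly as q^2, matching the quadratic
    // growth of squared-error distortion with the quantizer step in the RD
    // cost 'score = lambda * rate + 256 * distortion' (see SetRDScore());
    // the individual factors and shifts are empirical tuning constants.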
    m->lambda_i4_          = (3 * q4 * q4) >> 7;
    m->lambda_i16_         = (3 * q16 * q16);
    m->lambda_uv_          = (3 * quv * quv) >> 6;
    m->lambda_mode_        = (1 * q4 * q4) >> 7;
    m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
    m->lambda_trellis_i16_ = (q16 * q16) >> 2;
    m->lambda_trellis_uv_  = (quv * quv) << 1;
    m->tlambda_            = (tlambda_scale * q4) >> 5;

    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;
  }
}

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a linear piece-wise function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}
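// For example, quality=75 maps to c=0.75, hence linear_c = 2 * 0.75 - 1 = 0.5
// and a compression factor of 0.5^(1/3) ~= 0.794.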

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}
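// E.g. for a mid-complexity alpha = 0.5: slope = (0.4 - 0.9) / (0.85 - 0.30)
// ~= -0.909, so expn ~= 0.9 - 0.909 * (0.5 - 0.30) ~= 0.72.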

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

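// Merge segments that ended up with identical quantizer and filter strength:
// equivalent segments are coalesced and the per-macroblock segment ids are
// remapped, so that fewer segments need to be signalled in the bitstream.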
static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  const int num_segments = enc->segment_hdr_.num_segments_;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {   // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, if needed

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);   // finalize quantization matrices
}

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const int VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+
// |YYYY| 0
// |YYYY| 4
// |YYYY| 8
// |YYYY| 12
// +----+
// |UUVV| 16
// |UUVV| 20
// +----+

const int VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS,  8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS,  8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS,  8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS,  8 + 12 * BPS, 12 + 12 * BPS,
};

static const int VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,   4 + 0 * BPS,  0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS,  8 + 4 * BPS, 12 + 4 * BPS     // V
};
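// E.g. VP8Scan[5] == 4 + 4 * BPS: the 4x4 luma sub-block whose top-left pixel
// sits at coordinates (4, 4) within the macroblock, BPS being the row stride
// of the work buffers.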

//------------------------------------------------------------------------------
// Distortion measurement

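// Per-coefficient weights used by the texture distortion (VP8TDisto*):
// low-frequency coefficients (top-left of the 4x4 block) are weighted more
// heavily, as errors there tend to be more visible.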
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D  = 0;
  rd->SD = 0;
  rd->R  = 0;
  rd->H  = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
  dst->H  = src->H;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
  dst->H  += src->H;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Trellis node
typedef struct {
  int8_t prev;     // best previous node
  int8_t sign;     // sign of coeff_i
  int16_t level;   // level
} Node;

// Score state
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
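// With the current MIN_DELTA == 0 and MAX_DELTA == 1, a coefficient quantized
// to 'level0' is therefore tried at level0 and level0 + 1, giving
// NUM_NODES == 2 candidate nodes per trellis position.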

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  // TODO: incorporate the "* 256" in the tables?
  rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + 256 * distortion;
}
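// Both score functions implement the usual Lagrangian cost J = rate * lambda
// + distortion, with the distortion pre-multiplied by 256 so that the rate
// (from the entropy-cost tables) and the squared-error distortion sit on
// comparable scales.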

static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  const CostArray* const costs = enc->proba_.level_cost_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[VP8EncBands[first]][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q  = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);     // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score, last_pos_score;
      score_t best_cur_score = MAX_COST;
      int best_prev = 0;   // default, in case.

      ss_cur[m].score = MAX_COST;
      ss_cur[m].costs = costs[band][ctx];
      if (level > MAX_LEVEL || level < 0) {   // node is dead?
        continue;
      }

      // Compute extra rate cost if last coeff's position is < 15
      {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract from max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        const score_t score =
            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0) {
        const score_t score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;             // best eob position
          best_path[1] = m;             // best node index
          best_path[2] = best_prev;     // best predecessor
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.
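// The 'nz' bitfield returned by the Reconstruct*() functions below uses
// bits 0..15 for the 16 luma 4x4 blocks, bits 16..23 for the chroma blocks
// and bit 24 for the luma DC (Y2) block.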

static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; ++n) {
    VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; ++n) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = 0;
      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; ++n) {
    VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
                                   &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; ++n) {
      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost: score = Rate + lambda * Distortion.

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine the average
  // delta between the sub-4x4 blocks.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[4]);
  const int v2 = abs(DCs[5]);
  int max_v = (v0 > v1) ? v0 : v1;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

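// Returns 1 if the 'num_blocks' blocks contain at most 'thresh' non-zero
// AC coefficients in total (DC is ignored), 0 otherwise.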
static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
  score_t score = 0;
  while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
    int i;
    for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
      score += (levels[i] != 0);
      if (score > thresh) return 0;
    }
    levels += 16;
  }
  return 1;
}

static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
  VP8ModeScore rd16;
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;   // scratch buffer
    int nz;

    // Reconstruct
    nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);

    // Measure RD-score
    rd16.D = VP8SSE16x16(src, tmp_dst);
    rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
            : 0;
    rd16.H = VP8FixedCostsI16[mode];
    rd16.R = VP8GetCostLuma16(it, &rd16);
    if (mode > 0 &&
        IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
      // penalty to avoid flat areas being mispredicted by a complex mode
      rd16.R += FLATNESS_PENALTY * kNumBlocks;
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, &rd16);
    if (mode == 0 || rd16.score < rd->score) {
      CopyScore(rd, &rd16);
      rd->mode_i16 = mode;
      rd->nz = nz;
      memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
      memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
      SwapOut(it);
    }
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // If we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}
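// In GetCostModeI4() above, 'left' and 'top' are the prediction modes of the
// neighbouring 4x4 blocks: taken from this macroblock's 'modes' array when
// available, or from the cached prediction-mode map (it->preds_) when the
// current sub-block sits on the macroblock's left or top border.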

static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;   // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];
      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
      }

      SetRDScore(lambda, &rd_tmp);
      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF;
  uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF;   // scratch buffer
  uint8_t* const dst0 = it->yuv_out_ + U_OFF;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      memcpy(dst0, tmp_dst, UV_SIZE);   // TODO: SwapUVOut() ?
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
  const int is_i16 = (it->mb_->type_ == 1);
  score_t best_score = MAX_COST;

  if (try_both_i4_i16 || is_i16) {
    int mode;
    int best_mode = -1;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const uint8_t* const src = it->yuv_in_ + Y_OFF;
      const score_t score = VP8SSE16x16(src, ref);
      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    VP8SetIntra16Mode(it, best_mode);
  }
  if (try_both_i4_i16 || !is_i16) {
    uint8_t modes_i4[16];
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    score_t score_i4 = (score_t)I4_PENALTY;

    VP8IteratorStartI4(it);
    do {
      int mode;
      int best_sub_mode = -1;
      score_t best_sub_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];

      // TODO(skal): we don't really need the prediction pixels here,
      // but just the distortion against 'src'.
      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref);
        if (score < best_sub_score) {
          best_sub_mode = mode;
          best_sub_score = score;
        }
      }
      modes_i4[it->i4_] = best_sub_mode;
      score_i4 += best_sub_score;
      if (score_i4 >= best_score) break;
    } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
    if (score_i4 < best_score) {
      VP8SetIntra4Mode(it, modes_i4);
    }
  }
}

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // For method == 2, pick the best intra4/intra16 based on SSE (a tad slower).
    // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
    DistoRefine(it, (method >= 2));
    SimpleQuantize(it, rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}