quant_enc.c revision fa39824bb690c5806358871f46940d0450973d8a
1// Copyright 2011 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10//   Quantization
11//
12// Author: Skal (pascal.massimino@gmail.com)
13
14#include <assert.h>
15#include <math.h>
16#include <stdlib.h>  // for abs()
17
18#include "./vp8i_enc.h"
19#include "./cost_enc.h"
20
21#define DO_TRELLIS_I4  1
22#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
23#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
24#define USE_TDISTO 1
25
26#define MID_ALPHA 64      // neutral value for susceptibility
27#define MIN_ALPHA 30      // lowest usable value for susceptibility
28#define MAX_ALPHA 100     // higher meaningful value for susceptibility
29
30#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
31                          // power-law modulation. Must be strictly less than 1.
32
33// number of non-zero coeffs below which we consider the block very flat
34// (and apply a penalty to complex predictions)
35#define FLATNESS_LIMIT_I16 10      // I16 mode
36#define FLATNESS_LIMIT_I4  3       // I4 mode
37#define FLATNESS_LIMIT_UV  2       // UV mode
38#define FLATNESS_PENALTY   140     // roughly ~1bit per block
39
40#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
41
42#define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
43
44// #define DEBUG_BLOCK
45
46//------------------------------------------------------------------------------
47
48#if defined(DEBUG_BLOCK)
49
50#include <stdio.h>
51#include <stdlib.h>
52
53static void PrintBlockInfo(const VP8EncIterator* const it,
54                           const VP8ModeScore* const rd) {
55  int i, j;
56  const int is_i16 = (it->mb_->type_ == 1);
57  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
58  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
59  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
60  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
61  printf("SOURCE / OUTPUT / ABS DELTA\n");
62  for (j = 0; j < 16; ++j) {
63    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
64    printf("     ");
65    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
66    printf("     ");
67    for (i = 0; i < 16; ++i) {
68      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
69    }
70    printf("\n");
71  }
72  printf("\n");   // newline before the U/V block
73  for (j = 0; j < 8; ++j) {
74    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
75    printf(" ");
76    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
77    printf("    ");
78    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
79    printf(" ");
80    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
81    printf("   ");
82    for (i = 0; i < 8; ++i) {
83      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
84    }
85    printf(" ");
86    for (i = 8; i < 16; ++i) {
87      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
88    }
89    printf("\n");
90  }
91  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
92    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
93    (int)rd->score);
94  if (is_i16) {
95    printf("Mode: %d\n", rd->mode_i16);
96    printf("y_dc_levels:");
97    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
98    printf("\n");
99  } else {
100    printf("Modes[16]: ");
101    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
102    printf("\n");
103  }
104  printf("y_ac_levels:\n");
105  for (j = 0; j < 16; ++j) {
106    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
107      printf("%4d ", rd->y_ac_levels[j][i]);
108    }
109    printf("\n");
110  }
111  printf("\n");
112  printf("uv_levels (mode=%d):\n", rd->mode_uv);
113  for (j = 0; j < 8; ++j) {
114    for (i = 0; i < 16; ++i) {
115      printf("%4d ", rd->uv_levels[j][i]);
116    }
117    printf("\n");
118  }
119}
120
121#endif   // DEBUG_BLOCK
122
123//------------------------------------------------------------------------------
124
125static WEBP_INLINE int clip(int v, int m, int M) {
126  return v < m ? m : v > M ? M : v;
127}
128
129static const uint8_t kZigzag[16] = {
130  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
131};
132
133static const uint8_t kDcTable[128] = {
134  4,     5,   6,   7,   8,   9,  10,  10,
135  11,   12,  13,  14,  15,  16,  17,  17,
136  18,   19,  20,  20,  21,  21,  22,  22,
137  23,   23,  24,  25,  25,  26,  27,  28,
138  29,   30,  31,  32,  33,  34,  35,  36,
139  37,   37,  38,  39,  40,  41,  42,  43,
140  44,   45,  46,  46,  47,  48,  49,  50,
141  51,   52,  53,  54,  55,  56,  57,  58,
142  59,   60,  61,  62,  63,  64,  65,  66,
143  67,   68,  69,  70,  71,  72,  73,  74,
144  75,   76,  76,  77,  78,  79,  80,  81,
145  82,   83,  84,  85,  86,  87,  88,  89,
146  91,   93,  95,  96,  98, 100, 101, 102,
147  104, 106, 108, 110, 112, 114, 116, 118,
148  122, 124, 126, 128, 130, 132, 134, 136,
149  138, 140, 143, 145, 148, 151, 154, 157
150};
151
152static const uint16_t kAcTable[128] = {
153  4,     5,   6,   7,   8,   9,  10,  11,
154  12,   13,  14,  15,  16,  17,  18,  19,
155  20,   21,  22,  23,  24,  25,  26,  27,
156  28,   29,  30,  31,  32,  33,  34,  35,
157  36,   37,  38,  39,  40,  41,  42,  43,
158  44,   45,  46,  47,  48,  49,  50,  51,
159  52,   53,  54,  55,  56,  57,  58,  60,
160  62,   64,  66,  68,  70,  72,  74,  76,
161  78,   80,  82,  84,  86,  88,  90,  92,
162  94,   96,  98, 100, 102, 104, 106, 108,
163  110, 112, 114, 116, 119, 122, 125, 128,
164  131, 134, 137, 140, 143, 146, 149, 152,
165  155, 158, 161, 164, 167, 170, 173, 177,
166  181, 185, 189, 193, 197, 201, 205, 209,
167  213, 217, 221, 225, 229, 234, 239, 245,
168  249, 254, 259, 264, 269, 274, 279, 284
169};
170
171static const uint16_t kAcTable2[128] = {
172  8,     8,   9,  10,  12,  13,  15,  17,
173  18,   20,  21,  23,  24,  26,  27,  29,
174  31,   32,  34,  35,  37,  38,  40,  41,
175  43,   44,  46,  48,  49,  51,  52,  54,
176  55,   57,  58,  60,  62,  63,  65,  66,
177  68,   69,  71,  72,  74,  75,  77,  79,
178  80,   82,  83,  85,  86,  88,  89,  93,
179  96,   99, 102, 105, 108, 111, 114, 117,
180  120, 124, 127, 130, 133, 136, 139, 142,
181  145, 148, 151, 155, 158, 161, 164, 167,
182  170, 173, 176, 179, 184, 189, 193, 198,
183  203, 207, 212, 217, 221, 226, 230, 235,
184  240, 244, 249, 254, 258, 263, 268, 274,
185  280, 286, 292, 299, 305, 311, 317, 323,
186  330, 336, 342, 348, 354, 362, 370, 379,
187  385, 393, 401, 409, 416, 424, 432, 440
188};
189
190static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
191  { 96, 110 }, { 96, 108 }, { 110, 115 }
192};
193
194// Sharpening by (slightly) raising the hi-frequency coeffs.
195// Hack-ish but helpful for mid-bitrate range. Use with care.
196#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
197static const uint8_t kFreqSharpening[16] = {
198  0,  30, 60, 90,
199  30, 60, 90, 90,
200  60, 90, 90, 90,
201  90, 90, 90, 90
202};
203
204//------------------------------------------------------------------------------
205// Initialize quantization parameters in VP8Matrix
206
207// Returns the average quantizer
208static int ExpandMatrix(VP8Matrix* const m, int type) {
209  int i, sum;
210  for (i = 0; i < 2; ++i) {
211    const int is_ac_coeff = (i > 0);
212    const int bias = kBiasMatrices[type][is_ac_coeff];
213    m->iq_[i] = (1 << QFIX) / m->q_[i];
214    m->bias_[i] = BIAS(bias);
215    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
216    //   * zero if coeff <= zthresh
217    //   * non-zero if coeff > zthresh
218    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
219  }
220  for (i = 2; i < 16; ++i) {
221    m->q_[i] = m->q_[1];
222    m->iq_[i] = m->iq_[1];
223    m->bias_[i] = m->bias_[1];
224    m->zthresh_[i] = m->zthresh_[1];
225  }
226  for (sum = 0, i = 0; i < 16; ++i) {
227    if (type == 0) {  // we only use sharpening for AC luma coeffs
228      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
229    } else {
230      m->sharpen_[i] = 0;
231    }
232    sum += m->q_[i];
233  }
234  return (sum + 8) >> 4;
235}
236
237static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
238
239static void SetupMatrices(VP8Encoder* enc) {
240  int i;
241  const int tlambda_scale =
242    (enc->method_ >= 4) ? enc->config_->sns_strength
243                        : 0;
244  const int num_segments = enc->segment_hdr_.num_segments_;
245  for (i = 0; i < num_segments; ++i) {
246    VP8SegmentInfo* const m = &enc->dqm_[i];
247    const int q = m->quant_;
248    int q_i4, q_i16, q_uv;
249    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
250    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];
251
252    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
253    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
254
255    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
256    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
257
258    q_i4  = ExpandMatrix(&m->y1_, 0);
259    q_i16 = ExpandMatrix(&m->y2_, 1);
260    q_uv  = ExpandMatrix(&m->uv_, 2);
261
262    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
263    m->lambda_i16_         = (3 * q_i16 * q_i16);
264    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
265    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
266    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
267    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
268    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
269    m->tlambda_            = (tlambda_scale * q_i4) >> 5;
270
271    // none of these constants should be < 1
272    CheckLambdaValue(&m->lambda_i4_);
273    CheckLambdaValue(&m->lambda_i16_);
274    CheckLambdaValue(&m->lambda_uv_);
275    CheckLambdaValue(&m->lambda_mode_);
276    CheckLambdaValue(&m->lambda_trellis_i4_);
277    CheckLambdaValue(&m->lambda_trellis_i16_);
278    CheckLambdaValue(&m->lambda_trellis_uv_);
279    CheckLambdaValue(&m->tlambda_);
280
281    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
282    m->max_edge_  = 0;
283
284    m->i4_penalty_ = 1000 * q_i4 * q_i4;
285  }
286}
287
288//------------------------------------------------------------------------------
289// Initialize filtering parameters
290
291// Very small filter-strength values have close to no visual effect. So we can
292// save a little decoding-CPU by turning filtering off for these.
293#define FSTRENGTH_CUTOFF 2
294
295static void SetupFilterStrength(VP8Encoder* const enc) {
296  int i;
297  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
298  const int level0 = 5 * enc->config_->filter_strength;
299  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
300    VP8SegmentInfo* const m = &enc->dqm_[i];
301    // We focus on the quantization of AC coeffs.
302    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
303    const int base_strength =
304        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
305    // Segments with lower complexity ('beta') will be less filtered.
306    const int f = base_strength * level0 / (256 + m->beta_);
307    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
308  }
309  // We record the initial strength (mainly for the case of 1-segment only).
310  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
311  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
312  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
313}
314
315//------------------------------------------------------------------------------
316
317// Note: if you change the values below, remember that the max range
318// allowed by the syntax for DQ_UV is [-16,16].
319#define MAX_DQ_UV (6)
320#define MIN_DQ_UV (-4)
321
322// We want to emulate jpeg-like behaviour where the expected "good" quality
323// is around q=75. Internally, our "good" middle is around c=50. So we
324// map accordingly using linear piece-wise function
325static double QualityToCompression(double c) {
326  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
327  // The file size roughly scales as pow(quantizer, 3.). Actually, the
328  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
329  // in the mid-quant range. So we scale the compressibility inversely to
330  // this power-law: quant ~= compression ^ 1/3. This law holds well for
331  // low quant. Finer modeling for high-quant would make use of kAcTable[]
332  // more explicitly.
333  const double v = pow(linear_c, 1 / 3.);
334  return v;
335}
336
337static double QualityToJPEGCompression(double c, double alpha) {
338  // We map the complexity 'alpha' and quality setting 'c' to a compression
339  // exponent empirically matched to the compression curve of libjpeg6b.
340  // On average, the WebP output size will be roughly similar to that of a
341  // JPEG file compressed with same quality factor.
342  const double amin = 0.30;
343  const double amax = 0.85;
344  const double exp_min = 0.4;
345  const double exp_max = 0.9;
346  const double slope = (exp_min - exp_max) / (amax - amin);
347  // Linearly interpolate 'expn' from exp_min to exp_max
348  // in the [amin, amax] range.
349  const double expn = (alpha > amax) ? exp_min
350                    : (alpha < amin) ? exp_max
351                    : exp_max + slope * (alpha - amin);
352  const double v = pow(c, expn);
353  return v;
354}
355
356static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
357                                 const VP8SegmentInfo* const S2) {
358  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
359}
360
361static void SimplifySegments(VP8Encoder* const enc) {
362  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
363  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
364  // explicit check is needed to avoid a spurious warning about 'i' exceeding
365  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
366  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
367                               ? enc->segment_hdr_.num_segments_
368                               : NUM_MB_SEGMENTS;
369  int num_final_segments = 1;
370  int s1, s2;
371  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
372    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
373    int found = 0;
374    // check if we already have similar segment
375    for (s2 = 0; s2 < num_final_segments; ++s2) {
376      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
377      if (SegmentsAreEquivalent(S1, S2)) {
378        found = 1;
379        break;
380      }
381    }
382    map[s1] = s2;
383    if (!found) {
384      if (num_final_segments != s1) {
385        enc->dqm_[num_final_segments] = enc->dqm_[s1];
386      }
387      ++num_final_segments;
388    }
389  }
390  if (num_final_segments < num_segments) {  // Remap
391    int i = enc->mb_w_ * enc->mb_h_;
392    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
393    enc->segment_hdr_.num_segments_ = num_final_segments;
394    // Replicate the trailing segment infos (it's mostly cosmetics)
395    for (i = num_final_segments; i < num_segments; ++i) {
396      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
397    }
398  }
399}
400
401void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
402  int i;
403  int dq_uv_ac, dq_uv_dc;
404  const int num_segments = enc->segment_hdr_.num_segments_;
405  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
406  const double Q = quality / 100.;
407  const double c_base = enc->config_->emulate_jpeg_size ?
408      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
409      QualityToCompression(Q);
410  for (i = 0; i < num_segments; ++i) {
411    // We modulate the base coefficient to accommodate for the quantization
412    // susceptibility and allow denser segments to be quantized more.
413    const double expn = 1. - amp * enc->dqm_[i].alpha_;
414    const double c = pow(c_base, expn);
415    const int q = (int)(127. * (1. - c));
416    assert(expn > 0.);
417    enc->dqm_[i].quant_ = clip(q, 0, 127);
418  }
419
420  // purely indicative in the bitstream (except for the 1-segment case)
421  enc->base_quant_ = enc->dqm_[0].quant_;
422
423  // fill-in values for the unused segments (required by the syntax)
424  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
425    enc->dqm_[i].quant_ = enc->base_quant_;
426  }
427
428  // uv_alpha_ is normally spread around ~60. The useful range is
429  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
430  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
431  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
432                                          / (MAX_ALPHA - MIN_ALPHA);
433  // we rescale by the user-defined strength of adaptation
434  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
435  // and make it safe.
436  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
437  // We also boost the dc-uv-quant a little, based on sns-strength, since
438  // U/V channels are quite more reactive to high quants (flat DC-blocks
439  // tend to appear, and are unpleasant).
440  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
441  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed
442
443  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
444  enc->dq_y2_dc_ = 0;
445  enc->dq_y2_ac_ = 0;
446  enc->dq_uv_dc_ = dq_uv_dc;
447  enc->dq_uv_ac_ = dq_uv_ac;
448
449  SetupFilterStrength(enc);   // initialize segments' filtering, eventually
450
451  if (num_segments > 1) SimplifySegments(enc);
452
453  SetupMatrices(enc);         // finalize quantization matrices
454}
455
456//------------------------------------------------------------------------------
457// Form the predictions in cache
458
459// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
460const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
461const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
462
463// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
464const int VP8I4ModeOffsets[NUM_BMODES] = {
465  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
466};
467
468void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
469  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
470  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
471  VP8EncPredLuma16(it->yuv_p_, left, top);
472}
473
474void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
475  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
476  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
477  VP8EncPredChroma8(it->yuv_p_, left, top);
478}
479
480void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
481  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
482}
483
484//------------------------------------------------------------------------------
485// Quantize
486
487// Layout:
488// +----+----+
489// |YYYY|UUVV| 0
490// |YYYY|UUVV| 4
491// |YYYY|....| 8
492// |YYYY|....| 12
493// +----+----+
494
495const int VP8Scan[16] = {  // Luma
496  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
497  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
498  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
499  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
500};
501
502static const int VP8ScanUV[4 + 4] = {
503  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
504  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
505};
506
507//------------------------------------------------------------------------------
508// Distortion measurement
509
510static const uint16_t kWeightY[16] = {
511  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
512};
513
514static const uint16_t kWeightTrellis[16] = {
515#if USE_TDISTO == 0
516  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
517#else
518  30, 27, 19, 11,
519  27, 24, 17, 10,
520  19, 17, 12,  8,
521  11, 10,  8,  6
522#endif
523};
524
525// Init/Copy the common fields in score.
526static void InitScore(VP8ModeScore* const rd) {
527  rd->D  = 0;
528  rd->SD = 0;
529  rd->R  = 0;
530  rd->H  = 0;
531  rd->nz = 0;
532  rd->score = MAX_COST;
533}
534
535static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
536  dst->D  = src->D;
537  dst->SD = src->SD;
538  dst->R  = src->R;
539  dst->H  = src->H;
540  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
541  dst->score = src->score;
542}
543
544static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
545  dst->D  += src->D;
546  dst->SD += src->SD;
547  dst->R  += src->R;
548  dst->H  += src->H;
549  dst->nz |= src->nz;     // here, new nz bits are accumulated.
550  dst->score += src->score;
551}
552
553//------------------------------------------------------------------------------
554// Performs trellis-optimized quantization.
555
556// Trellis node
557typedef struct {
558  int8_t prev;            // best previous node
559  int8_t sign;            // sign of coeff_i
560  int16_t level;          // level
561} Node;
562
563// Score state
564typedef struct {
565  score_t score;          // partial RD score
566  const uint16_t* costs;  // shortcut to cost tables
567} ScoreState;
568
569// If a coefficient was quantized to a value Q (using a neutral bias),
570// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
571// We don't test negative values though.
572#define MIN_DELTA 0   // how much lower level to try
573#define MAX_DELTA 1   // how much higher
574#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
575#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
576#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
577
578static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
579  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
580}
581
582static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
583                                          score_t distortion) {
584  return rate * lambda + RD_DISTO_MULT * distortion;
585}
586
587static int TrellisQuantizeBlock(const VP8Encoder* const enc,
588                                int16_t in[16], int16_t out[16],
589                                int ctx0, int coeff_type,
590                                const VP8Matrix* const mtx,
591                                int lambda) {
592  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
593  CostArrayPtr const costs =
594      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
595  const int first = (coeff_type == 0) ? 1 : 0;
596  Node nodes[16][NUM_NODES];
597  ScoreState score_states[2][NUM_NODES];
598  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
599  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
600  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
601  score_t best_score;
602  int n, m, p, last;
603
604  {
605    score_t cost;
606    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
607    const int last_proba = probas[VP8EncBands[first]][ctx0][0];
608
609    // compute the position of the last interesting coefficient
610    last = first - 1;
611    for (n = 15; n >= first; --n) {
612      const int j = kZigzag[n];
613      const int err = in[j] * in[j];
614      if (err > thresh) {
615        last = n;
616        break;
617      }
618    }
619    // we don't need to go inspect up to n = 16 coeffs. We can just go up
620    // to last + 1 (inclusive) without losing much.
621    if (last < 15) ++last;
622
623    // compute 'skip' score. This is the max score one can do.
624    cost = VP8BitCost(0, last_proba);
625    best_score = RDScoreTrellis(lambda, cost, 0);
626
627    // initialize source node.
628    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
629      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
630      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
631      ss_cur[m].costs = costs[first][ctx0];
632    }
633  }
634
635  // traverse trellis.
636  for (n = first; n <= last; ++n) {
637    const int j = kZigzag[n];
638    const uint32_t Q  = mtx->q_[j];
639    const uint32_t iQ = mtx->iq_[j];
640    const uint32_t B = BIAS(0x00);     // neutral bias
641    // note: it's important to take sign of the _original_ coeff,
642    // so we don't have to consider level < 0 afterward.
643    const int sign = (in[j] < 0);
644    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
645    int level0 = QUANTDIV(coeff0, iQ, B);
646    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
647    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
648    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
649
650    {   // Swap current and previous score states
651      ScoreState* const tmp = ss_cur;
652      ss_cur = ss_prev;
653      ss_prev = tmp;
654    }
655
656    // test all alternate level values around level0.
657    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
658      Node* const cur = &NODE(n, m);
659      int level = level0 + m;
660      const int ctx = (level > 2) ? 2 : level;
661      const int band = VP8EncBands[n + 1];
662      score_t base_score;
663      score_t best_cur_score = MAX_COST;
664      int best_prev = 0;   // default, in case
665
666      ss_cur[m].score = MAX_COST;
667      ss_cur[m].costs = costs[n + 1][ctx];
668      if (level < 0 || level > thresh_level) {
669        // Node is dead.
670        continue;
671      }
672
673      {
674        // Compute delta_error = how much coding this level will
675        // subtract to max_error as distortion.
676        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
677        const int new_error = coeff0 - level * Q;
678        const int delta_error =
679            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
680        base_score = RDScoreTrellis(lambda, 0, delta_error);
681      }
682
683      // Inspect all possible non-dead predecessors. Retain only the best one.
684      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
685        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
686        // eliminated since their score can't be better than the current best.
687        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
688        // Examine node assuming it's a non-terminal one.
689        const score_t score =
690            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
691        if (score < best_cur_score) {
692          best_cur_score = score;
693          best_prev = p;
694        }
695      }
696      // Store best finding in current node.
697      cur->sign = sign;
698      cur->level = level;
699      cur->prev = best_prev;
700      ss_cur[m].score = best_cur_score;
701
702      // Now, record best terminal node (and thus best entry in the graph).
703      if (level != 0) {
704        const score_t last_pos_cost =
705            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
706        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
707        const score_t score = best_cur_score + last_pos_score;
708        if (score < best_score) {
709          best_score = score;
710          best_path[0] = n;                     // best eob position
711          best_path[1] = m;                     // best node index
712          best_path[2] = best_prev;             // best predecessor
713        }
714      }
715    }
716  }
717
718  // Fresh start
719  memset(in + first, 0, (16 - first) * sizeof(*in));
720  memset(out + first, 0, (16 - first) * sizeof(*out));
721  if (best_path[0] == -1) {
722    return 0;   // skip!
723  }
724
725  {
726    // Unwind the best path.
727    // Note: best-prev on terminal node is not necessarily equal to the
728    // best_prev for non-terminal. So we patch best_path[2] in.
729    int nz = 0;
730    int best_node = best_path[1];
731    n = best_path[0];
732    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
733
734    for (; n >= first; --n) {
735      const Node* const node = &NODE(n, best_node);
736      const int j = kZigzag[n];
737      out[n] = node->sign ? -node->level : node->level;
738      nz |= node->level;
739      in[j] = out[n] * mtx->q_[j];
740      best_node = node->prev;
741    }
742    return (nz != 0);
743  }
744}
745
746#undef NODE
747
748//------------------------------------------------------------------------------
749// Performs: difference, transform, quantize, back-transform, add
750// all at once. Output is the reconstructed block in *yuv_out, and the
751// quantized levels in *levels.
752
753static int ReconstructIntra16(VP8EncIterator* const it,
754                              VP8ModeScore* const rd,
755                              uint8_t* const yuv_out,
756                              int mode) {
757  const VP8Encoder* const enc = it->enc_;
758  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
759  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
760  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
761  int nz = 0;
762  int n;
763  int16_t tmp[16][16], dc_tmp[16];
764
765  for (n = 0; n < 16; n += 2) {
766    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
767  }
768  VP8FTransformWHT(tmp[0], dc_tmp);
769  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
770
771  if (DO_TRELLIS_I16 && it->do_trellis_) {
772    int x, y;
773    VP8IteratorNzToBytes(it);
774    for (y = 0, n = 0; y < 4; ++y) {
775      for (x = 0; x < 4; ++x, ++n) {
776        const int ctx = it->top_nz_[x] + it->left_nz_[y];
777        const int non_zero =
778            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
779                                 &dqm->y1_, dqm->lambda_trellis_i16_);
780        it->top_nz_[x] = it->left_nz_[y] = non_zero;
781        rd->y_ac_levels[n][0] = 0;
782        nz |= non_zero << n;
783      }
784    }
785  } else {
786    for (n = 0; n < 16; n += 2) {
787      // Zero-out the first coeff, so that: a) nz is correct below, and
788      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
789      tmp[n][0] = tmp[n + 1][0] = 0;
790      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
791      assert(rd->y_ac_levels[n + 0][0] == 0);
792      assert(rd->y_ac_levels[n + 1][0] == 0);
793    }
794  }
795
796  // Transform back
797  VP8TransformWHT(dc_tmp, tmp[0]);
798  for (n = 0; n < 16; n += 2) {
799    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
800  }
801
802  return nz;
803}
804
805static int ReconstructIntra4(VP8EncIterator* const it,
806                             int16_t levels[16],
807                             const uint8_t* const src,
808                             uint8_t* const yuv_out,
809                             int mode) {
810  const VP8Encoder* const enc = it->enc_;
811  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
812  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
813  int nz = 0;
814  int16_t tmp[16];
815
816  VP8FTransform(src, ref, tmp);
817  if (DO_TRELLIS_I4 && it->do_trellis_) {
818    const int x = it->i4_ & 3, y = it->i4_ >> 2;
819    const int ctx = it->top_nz_[x] + it->left_nz_[y];
820    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
821                              dqm->lambda_trellis_i4_);
822  } else {
823    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
824  }
825  VP8ITransform(ref, tmp, yuv_out, 0);
826  return nz;
827}
828
829static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
830                         uint8_t* const yuv_out, int mode) {
831  const VP8Encoder* const enc = it->enc_;
832  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
833  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
834  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
835  int nz = 0;
836  int n;
837  int16_t tmp[8][16];
838
839  for (n = 0; n < 8; n += 2) {
840    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
841  }
842  if (DO_TRELLIS_UV && it->do_trellis_) {
843    int ch, x, y;
844    for (ch = 0, n = 0; ch <= 2; ch += 2) {
845      for (y = 0; y < 2; ++y) {
846        for (x = 0; x < 2; ++x, ++n) {
847          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
848          const int non_zero =
849              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
850                                   &dqm->uv_, dqm->lambda_trellis_uv_);
851          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
852          nz |= non_zero << n;
853        }
854      }
855    }
856  } else {
857    for (n = 0; n < 8; n += 2) {
858      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
859    }
860  }
861
862  for (n = 0; n < 8; n += 2) {
863    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
864  }
865  return (nz << 16);
866}
867
868//------------------------------------------------------------------------------
869// RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
870// Pick the mode is lower RD-cost = Rate + lambda * Distortion.
871
872static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
873  // We look at the first three AC coefficients to determine what is the average
874  // delta between each sub-4x4 block.
875  const int v0 = abs(DCs[1]);
876  const int v1 = abs(DCs[2]);
877  const int v2 = abs(DCs[4]);
878  int max_v = (v1 > v0) ? v1 : v0;
879  max_v = (v2 > max_v) ? v2 : max_v;
880  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
881}
882
883static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
884  VP8ModeScore* const tmp = *a;
885  *a = *b;
886  *b = tmp;
887}
888
889static void SwapPtr(uint8_t** a, uint8_t** b) {
890  uint8_t* const tmp = *a;
891  *a = *b;
892  *b = tmp;
893}
894
895static void SwapOut(VP8EncIterator* const it) {
896  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
897}
898
899static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
900  score_t score = 0;
901  while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
902    int i;
903    for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
904      score += (levels[i] != 0);
905      if (score > thresh) return 0;
906    }
907    levels += 16;
908  }
909  return 1;
910}
911
912static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
913  const int kNumBlocks = 16;
914  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
915  const int lambda = dqm->lambda_i16_;
916  const int tlambda = dqm->tlambda_;
917  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
918  VP8ModeScore rd_tmp;
919  VP8ModeScore* rd_cur = &rd_tmp;
920  VP8ModeScore* rd_best = rd;
921  int mode;
922
923  rd->mode_i16 = -1;
924  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
925    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
926    rd_cur->mode_i16 = mode;
927
928    // Reconstruct
929    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
930
931    // Measure RD-score
932    rd_cur->D = VP8SSE16x16(src, tmp_dst);
933    rd_cur->SD =
934        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
935    rd_cur->H = VP8FixedCostsI16[mode];
936    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
937    if (mode > 0 &&
938        IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
939      // penalty to avoid flat area to be mispredicted by complex mode
940      rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
941    }
942
943    // Since we always examine Intra16 first, we can overwrite *rd directly.
944    SetRDScore(lambda, rd_cur);
945    if (mode == 0 || rd_cur->score < rd_best->score) {
946      SwapModeScore(&rd_cur, &rd_best);
947      SwapOut(it);
948    }
949  }
950  if (rd_best != rd) {
951    memcpy(rd, rd_best, sizeof(*rd));
952  }
953  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
954  VP8SetIntra16Mode(it, rd->mode_i16);
955
956  // we have a blocky macroblock (only DCs are non-zero) with fairly high
957  // distortion, record max delta so we can later adjust the minimal filtering
958  // strength needed to smooth these blocks out.
959  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
960    StoreMaxDelta(dqm, rd->y_dc_levels);
961  }
962}
963
964//------------------------------------------------------------------------------
965
966// return the cost array corresponding to the surrounding prediction modes.
967static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
968                                     const uint8_t modes[16]) {
969  const int preds_w = it->enc_->preds_w_;
970  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
971  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
972  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
973  return VP8FixedCostsI4[top][left];
974}
975
976static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
977  const VP8Encoder* const enc = it->enc_;
978  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
979  const int lambda = dqm->lambda_i4_;
980  const int tlambda = dqm->tlambda_;
981  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
982  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
983  int total_header_bits = 0;
984  VP8ModeScore rd_best;
985
986  if (enc->max_i4_header_bits_ == 0) {
987    return 0;
988  }
989
990  InitScore(&rd_best);
991  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
992  SetRDScore(dqm->lambda_mode_, &rd_best);
993  VP8IteratorStartI4(it);
994  do {
995    const int kNumBlocks = 1;
996    VP8ModeScore rd_i4;
997    int mode;
998    int best_mode = -1;
999    const uint8_t* const src = src0 + VP8Scan[it->i4_];
1000    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1001    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
1002    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.
1003
1004    InitScore(&rd_i4);
1005    VP8MakeIntra4Preds(it);
1006    for (mode = 0; mode < NUM_BMODES; ++mode) {
1007      VP8ModeScore rd_tmp;
1008      int16_t tmp_levels[16];
1009
1010      // Reconstruct
1011      rd_tmp.nz =
1012          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
1013
1014      // Compute RD-score
1015      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
1016      rd_tmp.SD =
1017          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
1018                  : 0;
1019      rd_tmp.H = mode_costs[mode];
1020
1021      // Add flatness penalty
1022      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
1023        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
1024      } else {
1025        rd_tmp.R = 0;
1026      }
1027
1028      // early-out check
1029      SetRDScore(lambda, &rd_tmp);
1030      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;
1031
1032      // finish computing score
1033      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
1034      SetRDScore(lambda, &rd_tmp);
1035
1036      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
1037        CopyScore(&rd_i4, &rd_tmp);
1038        best_mode = mode;
1039        SwapPtr(&tmp_dst, &best_block);
1040        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
1041               sizeof(rd_best.y_ac_levels[it->i4_]));
1042      }
1043    }
1044    SetRDScore(dqm->lambda_mode_, &rd_i4);
1045    AddScore(&rd_best, &rd_i4);
1046    if (rd_best.score >= rd->score) {
1047      return 0;
1048    }
1049    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
1050    if (total_header_bits > enc->max_i4_header_bits_) {
1051      return 0;
1052    }
1053    // Copy selected samples if not in the right place already.
1054    if (best_block != best_blocks + VP8Scan[it->i4_]) {
1055      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
1056    }
1057    rd->modes_i4[it->i4_] = best_mode;
1058    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
1059  } while (VP8IteratorRotateI4(it, best_blocks));
1060
1061  // finalize state
1062  CopyScore(rd, &rd_best);
1063  VP8SetIntra4Mode(it, rd->modes_i4);
1064  SwapOut(it);
1065  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
1066  return 1;   // select intra4x4 over intra16x16
1067}
1068
1069//------------------------------------------------------------------------------
1070
1071static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
1072  const int kNumBlocks = 8;
1073  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1074  const int lambda = dqm->lambda_uv_;
1075  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1076  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
1077  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
1078  uint8_t* dst = dst0;
1079  VP8ModeScore rd_best;
1080  int mode;
1081
1082  rd->mode_uv = -1;
1083  InitScore(&rd_best);
1084  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1085    VP8ModeScore rd_uv;
1086
1087    // Reconstruct
1088    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
1089
1090    // Compute RD-score
1091    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
1092    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
1093    rd_uv.H  = VP8FixedCostsUV[mode];
1094    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
1095    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
1096      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
1097    }
1098
1099    SetRDScore(lambda, &rd_uv);
1100    if (mode == 0 || rd_uv.score < rd_best.score) {
1101      CopyScore(&rd_best, &rd_uv);
1102      rd->mode_uv = mode;
1103      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
1104      SwapPtr(&dst, &tmp_dst);
1105    }
1106  }
1107  VP8SetIntraUVMode(it, rd->mode_uv);
1108  AddScore(rd, &rd_best);
1109  if (dst != dst0) {   // copy 16x8 block if needed
1110    VP8Copy16x8(dst, dst0);
1111  }
1112}
1113
1114//------------------------------------------------------------------------------
1115// Final reconstruction and quantization.
1116
1117static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
1118  const VP8Encoder* const enc = it->enc_;
1119  const int is_i16 = (it->mb_->type_ == 1);
1120  int nz = 0;
1121
1122  if (is_i16) {
1123    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
1124  } else {
1125    VP8IteratorStartI4(it);
1126    do {
1127      const int mode =
1128          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
1129      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1130      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
1131      VP8MakeIntra4Preds(it);
1132      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1133                              src, dst, mode) << it->i4_;
1134    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
1135  }
1136
1137  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1138  rd->nz = nz;
1139}
1140
1141// Refine intra16/intra4 sub-modes based on distortion only (not rate).
1142static void RefineUsingDistortion(VP8EncIterator* const it,
1143                                  int try_both_modes, int refine_uv_mode,
1144                                  VP8ModeScore* const rd) {
1145  score_t best_score = MAX_COST;
1146  int nz = 0;
1147  int mode;
1148  int is_i16 = try_both_modes || (it->mb_->type_ == 1);
1149
1150  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1151  // Some empiric constants, of approximate order of magnitude.
1152  const int lambda_d_i16 = 106;
1153  const int lambda_d_i4 = 11;
1154  const int lambda_d_uv = 120;
1155  score_t score_i4 = dqm->i4_penalty_;
1156  score_t i4_bit_sum = 0;
1157  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
1158                                           : MAX_COST;  // no early-out allowed
1159
1160  if (is_i16) {   // First, evaluate Intra16 distortion
1161    int best_mode = -1;
1162    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
1163    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1164      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1165      const score_t score = VP8SSE16x16(src, ref) * RD_DISTO_MULT
1166                          + VP8FixedCostsI16[mode] * lambda_d_i16;
1167      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
1168        continue;
1169      }
1170      if (score < best_score) {
1171        best_mode = mode;
1172        best_score = score;
1173      }
1174    }
1175    VP8SetIntra16Mode(it, best_mode);
1176    // we'll reconstruct later, if i16 mode actually gets selected
1177  }
1178
1179  // Next, evaluate Intra4
1180  if (try_both_modes || !is_i16) {
1181    // We don't evaluate the rate here, but just account for it through a
1182    // constant penalty (i4 mode usually needs more bits compared to i16).
1183    is_i16 = 0;
1184    VP8IteratorStartI4(it);
1185    do {
1186      int best_i4_mode = -1;
1187      score_t best_i4_score = MAX_COST;
1188      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1189      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1190
1191      VP8MakeIntra4Preds(it);
1192      for (mode = 0; mode < NUM_BMODES; ++mode) {
1193        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1194        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
1195                            + mode_costs[mode] * lambda_d_i4;
1196        if (score < best_i4_score) {
1197          best_i4_mode = mode;
1198          best_i4_score = score;
1199        }
1200      }
1201      i4_bit_sum += mode_costs[best_i4_mode];
1202      rd->modes_i4[it->i4_] = best_i4_mode;
1203      score_i4 += best_i4_score;
1204      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
1205        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
1206        is_i16 = 1;
1207        break;
1208      } else {  // reconstruct partial block inside yuv_out2_ buffer
1209        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
1210        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1211                                src, tmp_dst, best_i4_mode) << it->i4_;
1212      }
1213    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
1214  }
1215
1216  // Final reconstruction, depending on which mode is selected.
1217  if (!is_i16) {
1218    VP8SetIntra4Mode(it, rd->modes_i4);
1219    SwapOut(it);
1220    best_score = score_i4;
1221  } else {
1222    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
1223  }
1224
1225  // ... and UV!
1226  if (refine_uv_mode) {
1227    int best_mode = -1;
1228    score_t best_uv_score = MAX_COST;
1229    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1230    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1231      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
1232      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
1233                          + VP8FixedCostsUV[mode] * lambda_d_uv;
1234      if (score < best_uv_score) {
1235        best_mode = mode;
1236        best_uv_score = score;
1237      }
1238    }
1239    VP8SetIntraUVMode(it, best_mode);
1240  }
1241  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1242
1243  rd->nz = nz;
1244  rd->score = best_score;
1245}
1246
1247//------------------------------------------------------------------------------
1248// Entry point
1249
1250int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1251                VP8RDLevel rd_opt) {
1252  int is_skipped;
1253  const int method = it->enc_->method_;
1254
1255  InitScore(rd);
1256
1257  // We can perform predictions for Luma16x16 and Chroma8x8 already.
1258  // Luma4x4 predictions needs to be done as-we-go.
1259  VP8MakeLuma16Preds(it);
1260  VP8MakeChroma8Preds(it);
1261
1262  if (rd_opt > RD_OPT_NONE) {
1263    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1264    PickBestIntra16(it, rd);
1265    if (method >= 2) {
1266      PickBestIntra4(it, rd);
1267    }
1268    PickBestUV(it, rd);
1269    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
1270      it->do_trellis_ = 1;
1271      SimpleQuantize(it, rd);
1272    }
1273  } else {
1274    // At this point we have heuristically decided intra16 / intra4.
1275    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
1276    // For method <= 1, we don't re-examine the decision but just go ahead with
1277    // quantization/reconstruction.
1278    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
1279  }
1280  is_skipped = (rd->nz == 0);
1281  VP8SetSkip(it, is_skipped);
1282  return is_skipped;
1283}
1284