1// Copyright 2011 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10//   Quantization
11//
12// Author: Skal (pascal.massimino@gmail.com)
13
14#include <assert.h>
15#include <math.h>
16#include <stdlib.h>  // for abs()
17
18#include "./vp8enci.h"
19#include "./cost.h"
20
21#define DO_TRELLIS_I4  1
22#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
23#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
24#define USE_TDISTO 1
25
26#define MID_ALPHA 64      // neutral value for susceptibility
27#define MIN_ALPHA 30      // lowest usable value for susceptibility
28#define MAX_ALPHA 100     // higher meaningful value for susceptibility
29
30#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
31                          // power-law modulation. Must be strictly less than 1.
32
33#define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision
34
35// number of non-zero coeffs below which we consider the block very flat
36// (and apply a penalty to complex predictions)
37#define FLATNESS_LIMIT_I16 10      // I16 mode
38#define FLATNESS_LIMIT_I4  3       // I4 mode
39#define FLATNESS_LIMIT_UV  2       // UV mode
40#define FLATNESS_PENALTY   140     // roughly ~1bit per block
41
42#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
43
44// #define DEBUG_BLOCK
45
46//------------------------------------------------------------------------------
47
48#if defined(DEBUG_BLOCK)
49
50#include <stdio.h>
51#include <stdlib.h>
52
53static void PrintBlockInfo(const VP8EncIterator* const it,
54                           const VP8ModeScore* const rd) {
55  int i, j;
56  const int is_i16 = (it->mb_->type_ == 1);
57  printf("SOURCE / OUTPUT / ABS DELTA\n");
58  for (j = 0; j < 24; ++j) {
59    if (j == 16) printf("\n");   // newline before the U/V block
60    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
61    printf("     ");
62    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
63    printf("     ");
64    for (i = 0; i < 16; ++i) {
65      printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
66    }
67    printf("\n");
68  }
69  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
70    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
71    (int)rd->score);
72  if (is_i16) {
73    printf("Mode: %d\n", rd->mode_i16);
74    printf("y_dc_levels:");
75    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
76    printf("\n");
77  } else {
78    printf("Modes[16]: ");
79    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
80    printf("\n");
81  }
82  printf("y_ac_levels:\n");
83  for (j = 0; j < 16; ++j) {
84    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
85      printf("%4d ", rd->y_ac_levels[j][i]);
86    }
87    printf("\n");
88  }
89  printf("\n");
90  printf("uv_levels (mode=%d):\n", rd->mode_uv);
91  for (j = 0; j < 8; ++j) {
92    for (i = 0; i < 16; ++i) {
93      printf("%4d ", rd->uv_levels[j][i]);
94    }
95    printf("\n");
96  }
97}
98
99#endif   // DEBUG_BLOCK
100
101//------------------------------------------------------------------------------
102
103static WEBP_INLINE int clip(int v, int m, int M) {
104  return v < m ? m : v > M ? M : v;
105}
106
107static const uint8_t kZigzag[16] = {
108  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
109};
110
111static const uint8_t kDcTable[128] = {
112  4,     5,   6,   7,   8,   9,  10,  10,
113  11,   12,  13,  14,  15,  16,  17,  17,
114  18,   19,  20,  20,  21,  21,  22,  22,
115  23,   23,  24,  25,  25,  26,  27,  28,
116  29,   30,  31,  32,  33,  34,  35,  36,
117  37,   37,  38,  39,  40,  41,  42,  43,
118  44,   45,  46,  46,  47,  48,  49,  50,
119  51,   52,  53,  54,  55,  56,  57,  58,
120  59,   60,  61,  62,  63,  64,  65,  66,
121  67,   68,  69,  70,  71,  72,  73,  74,
122  75,   76,  76,  77,  78,  79,  80,  81,
123  82,   83,  84,  85,  86,  87,  88,  89,
124  91,   93,  95,  96,  98, 100, 101, 102,
125  104, 106, 108, 110, 112, 114, 116, 118,
126  122, 124, 126, 128, 130, 132, 134, 136,
127  138, 140, 143, 145, 148, 151, 154, 157
128};
129
130static const uint16_t kAcTable[128] = {
131  4,     5,   6,   7,   8,   9,  10,  11,
132  12,   13,  14,  15,  16,  17,  18,  19,
133  20,   21,  22,  23,  24,  25,  26,  27,
134  28,   29,  30,  31,  32,  33,  34,  35,
135  36,   37,  38,  39,  40,  41,  42,  43,
136  44,   45,  46,  47,  48,  49,  50,  51,
137  52,   53,  54,  55,  56,  57,  58,  60,
138  62,   64,  66,  68,  70,  72,  74,  76,
139  78,   80,  82,  84,  86,  88,  90,  92,
140  94,   96,  98, 100, 102, 104, 106, 108,
141  110, 112, 114, 116, 119, 122, 125, 128,
142  131, 134, 137, 140, 143, 146, 149, 152,
143  155, 158, 161, 164, 167, 170, 173, 177,
144  181, 185, 189, 193, 197, 201, 205, 209,
145  213, 217, 221, 225, 229, 234, 239, 245,
146  249, 254, 259, 264, 269, 274, 279, 284
147};
148
149static const uint16_t kAcTable2[128] = {
150  8,     8,   9,  10,  12,  13,  15,  17,
151  18,   20,  21,  23,  24,  26,  27,  29,
152  31,   32,  34,  35,  37,  38,  40,  41,
153  43,   44,  46,  48,  49,  51,  52,  54,
154  55,   57,  58,  60,  62,  63,  65,  66,
155  68,   69,  71,  72,  74,  75,  77,  79,
156  80,   82,  83,  85,  86,  88,  89,  93,
157  96,   99, 102, 105, 108, 111, 114, 117,
158  120, 124, 127, 130, 133, 136, 139, 142,
159  145, 148, 151, 155, 158, 161, 164, 167,
160  170, 173, 176, 179, 184, 189, 193, 198,
161  203, 207, 212, 217, 221, 226, 230, 235,
162  240, 244, 249, 254, 258, 263, 268, 274,
163  280, 286, 292, 299, 305, 311, 317, 323,
164  330, 336, 342, 348, 354, 362, 370, 379,
165  385, 393, 401, 409, 416, 424, 432, 440
166};
167
168static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
169  { 96, 110 }, { 96, 108 }, { 110, 115 }
170};
171
172// Sharpening by (slightly) raising the hi-frequency coeffs.
173// Hack-ish but helpful for mid-bitrate range. Use with care.
174#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
175static const uint8_t kFreqSharpening[16] = {
176  0,  30, 60, 90,
177  30, 60, 90, 90,
178  60, 90, 90, 90,
179  90, 90, 90, 90
180};
181
182//------------------------------------------------------------------------------
183// Initialize quantization parameters in VP8Matrix
184
185// Returns the average quantizer
186static int ExpandMatrix(VP8Matrix* const m, int type) {
187  int i, sum;
188  for (i = 0; i < 2; ++i) {
189    const int is_ac_coeff = (i > 0);
190    const int bias = kBiasMatrices[type][is_ac_coeff];
191    m->iq_[i] = (1 << QFIX) / m->q_[i];
192    m->bias_[i] = BIAS(bias);
193    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
194    //   * zero if coeff <= zthresh
195    //   * non-zero if coeff > zthresh
196    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
197  }
198  for (i = 2; i < 16; ++i) {
199    m->q_[i] = m->q_[1];
200    m->iq_[i] = m->iq_[1];
201    m->bias_[i] = m->bias_[1];
202    m->zthresh_[i] = m->zthresh_[1];
203  }
204  for (sum = 0, i = 0; i < 16; ++i) {
205    if (type == 0) {  // we only use sharpening for AC luma coeffs
206      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
207    } else {
208      m->sharpen_[i] = 0;
209    }
210    sum += m->q_[i];
211  }
212  return (sum + 8) >> 4;
213}
214
215static void SetupMatrices(VP8Encoder* enc) {
216  int i;
217  const int tlambda_scale =
218    (enc->method_ >= 4) ? enc->config_->sns_strength
219                        : 0;
220  const int num_segments = enc->segment_hdr_.num_segments_;
221  for (i = 0; i < num_segments; ++i) {
222    VP8SegmentInfo* const m = &enc->dqm_[i];
223    const int q = m->quant_;
224    int q4, q16, quv;
225    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
226    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];
227
228    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
229    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
230
231    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
232    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
233
234    q4  = ExpandMatrix(&m->y1_, 0);
235    q16 = ExpandMatrix(&m->y2_, 1);
236    quv = ExpandMatrix(&m->uv_, 2);
237
238    m->lambda_i4_          = (3 * q4 * q4) >> 7;
239    m->lambda_i16_         = (3 * q16 * q16);
240    m->lambda_uv_          = (3 * quv * quv) >> 6;
241    m->lambda_mode_        = (1 * q4 * q4) >> 7;
242    m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
243    m->lambda_trellis_i16_ = (q16 * q16) >> 2;
244    m->lambda_trellis_uv_  = (quv *quv) << 1;
245    m->tlambda_            = (tlambda_scale * q4) >> 5;
246
247    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
248    m->max_edge_  = 0;
249  }
250}
251
252//------------------------------------------------------------------------------
253// Initialize filtering parameters
254
255// Very small filter-strength values have close to no visual effect. So we can
256// save a little decoding-CPU by turning filtering off for these.
257#define FSTRENGTH_CUTOFF 2
258
259static void SetupFilterStrength(VP8Encoder* const enc) {
260  int i;
261  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
262  const int level0 = 5 * enc->config_->filter_strength;
263  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
264    VP8SegmentInfo* const m = &enc->dqm_[i];
265    // We focus on the quantization of AC coeffs.
266    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
267    const int base_strength =
268        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
269    // Segments with lower complexity ('beta') will be less filtered.
270    const int f = base_strength * level0 / (256 + m->beta_);
271    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
272  }
273  // We record the initial strength (mainly for the case of 1-segment only).
274  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
275  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
276  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
277}
278
279//------------------------------------------------------------------------------
280
281// Note: if you change the values below, remember that the max range
282// allowed by the syntax for DQ_UV is [-16,16].
283#define MAX_DQ_UV (6)
284#define MIN_DQ_UV (-4)
285
286// We want to emulate jpeg-like behaviour where the expected "good" quality
287// is around q=75. Internally, our "good" middle is around c=50. So we
288// map accordingly using linear piece-wise function
289static double QualityToCompression(double c) {
290  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
291  // The file size roughly scales as pow(quantizer, 3.). Actually, the
292  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
293  // in the mid-quant range. So we scale the compressibility inversely to
294  // this power-law: quant ~= compression ^ 1/3. This law holds well for
295  // low quant. Finer modeling for high-quant would make use of kAcTable[]
296  // more explicitly.
297  const double v = pow(linear_c, 1 / 3.);
298  return v;
299}
300
301static double QualityToJPEGCompression(double c, double alpha) {
302  // We map the complexity 'alpha' and quality setting 'c' to a compression
303  // exponent empirically matched to the compression curve of libjpeg6b.
304  // On average, the WebP output size will be roughly similar to that of a
305  // JPEG file compressed with same quality factor.
306  const double amin = 0.30;
307  const double amax = 0.85;
308  const double exp_min = 0.4;
309  const double exp_max = 0.9;
310  const double slope = (exp_min - exp_max) / (amax - amin);
311  // Linearly interpolate 'expn' from exp_min to exp_max
312  // in the [amin, amax] range.
313  const double expn = (alpha > amax) ? exp_min
314                    : (alpha < amin) ? exp_max
315                    : exp_max + slope * (alpha - amin);
316  const double v = pow(c, expn);
317  return v;
318}
319
320static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
321                                 const VP8SegmentInfo* const S2) {
322  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
323}
324
325static void SimplifySegments(VP8Encoder* const enc) {
326  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
327  const int num_segments = enc->segment_hdr_.num_segments_;
328  int num_final_segments = 1;
329  int s1, s2;
330  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
331    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
332    int found = 0;
333    // check if we already have similar segment
334    for (s2 = 0; s2 < num_final_segments; ++s2) {
335      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
336      if (SegmentsAreEquivalent(S1, S2)) {
337        found = 1;
338        break;
339      }
340    }
341    map[s1] = s2;
342    if (!found) {
343      if (num_final_segments != s1) {
344        enc->dqm_[num_final_segments] = enc->dqm_[s1];
345      }
346      ++num_final_segments;
347    }
348  }
349  if (num_final_segments < num_segments) {  // Remap
350    int i = enc->mb_w_ * enc->mb_h_;
351    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
352    enc->segment_hdr_.num_segments_ = num_final_segments;
353    // Replicate the trailing segment infos (it's mostly cosmetics)
354    for (i = num_final_segments; i < num_segments; ++i) {
355      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
356    }
357  }
358}
359
360void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
361  int i;
362  int dq_uv_ac, dq_uv_dc;
363  const int num_segments = enc->segment_hdr_.num_segments_;
364  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
365  const double Q = quality / 100.;
366  const double c_base = enc->config_->emulate_jpeg_size ?
367      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
368      QualityToCompression(Q);
369  for (i = 0; i < num_segments; ++i) {
370    // We modulate the base coefficient to accommodate for the quantization
371    // susceptibility and allow denser segments to be quantized more.
372    const double expn = 1. - amp * enc->dqm_[i].alpha_;
373    const double c = pow(c_base, expn);
374    const int q = (int)(127. * (1. - c));
375    assert(expn > 0.);
376    enc->dqm_[i].quant_ = clip(q, 0, 127);
377  }
378
379  // purely indicative in the bitstream (except for the 1-segment case)
380  enc->base_quant_ = enc->dqm_[0].quant_;
381
382  // fill-in values for the unused segments (required by the syntax)
383  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
384    enc->dqm_[i].quant_ = enc->base_quant_;
385  }
386
387  // uv_alpha_ is normally spread around ~60. The useful range is
388  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
389  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
390  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
391                                          / (MAX_ALPHA - MIN_ALPHA);
392  // we rescale by the user-defined strength of adaptation
393  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
394  // and make it safe.
395  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
396  // We also boost the dc-uv-quant a little, based on sns-strength, since
397  // U/V channels are quite more reactive to high quants (flat DC-blocks
398  // tend to appear, and are unpleasant).
399  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
400  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed
401
402  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
403  enc->dq_y2_dc_ = 0;
404  enc->dq_y2_ac_ = 0;
405  enc->dq_uv_dc_ = dq_uv_dc;
406  enc->dq_uv_ac_ = dq_uv_ac;
407
408  SetupFilterStrength(enc);   // initialize segments' filtering, eventually
409
410  if (num_segments > 1) SimplifySegments(enc);
411
412  SetupMatrices(enc);         // finalize quantization matrices
413}
414
415//------------------------------------------------------------------------------
416// Form the predictions in cache
417
418// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
419const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
420const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
421
422// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
423const int VP8I4ModeOffsets[NUM_BMODES] = {
424  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
425};
426
427void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
428  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
429  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
430  VP8EncPredLuma16(it->yuv_p_, left, top);
431}
432
433void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
434  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
435  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
436  VP8EncPredChroma8(it->yuv_p_, left, top);
437}
438
439void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
440  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
441}
442
443//------------------------------------------------------------------------------
444// Quantize
445
446// Layout:
447// +----+
448// |YYYY| 0
449// |YYYY| 4
450// |YYYY| 8
451// |YYYY| 12
452// +----+
453// |UUVV| 16
454// |UUVV| 20
455// +----+
456
457const int VP8Scan[16] = {  // Luma
458  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
459  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
460  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
461  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
462};
463
464static const int VP8ScanUV[4 + 4] = {
465  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
466  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
467};
468
469//------------------------------------------------------------------------------
470// Distortion measurement
471
472static const uint16_t kWeightY[16] = {
473  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
474};
475
476static const uint16_t kWeightTrellis[16] = {
477#if USE_TDISTO == 0
478  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
479#else
480  30, 27, 19, 11,
481  27, 24, 17, 10,
482  19, 17, 12,  8,
483  11, 10,  8,  6
484#endif
485};
486
487// Init/Copy the common fields in score.
488static void InitScore(VP8ModeScore* const rd) {
489  rd->D  = 0;
490  rd->SD = 0;
491  rd->R  = 0;
492  rd->H  = 0;
493  rd->nz = 0;
494  rd->score = MAX_COST;
495}
496
497static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
498  dst->D  = src->D;
499  dst->SD = src->SD;
500  dst->R  = src->R;
501  dst->H  = src->H;
502  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
503  dst->score = src->score;
504}
505
506static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
507  dst->D  += src->D;
508  dst->SD += src->SD;
509  dst->R  += src->R;
510  dst->H  += src->H;
511  dst->nz |= src->nz;     // here, new nz bits are accumulated.
512  dst->score += src->score;
513}
514
515//------------------------------------------------------------------------------
516// Performs trellis-optimized quantization.
517
518// Trellis node
519typedef struct {
520  int8_t prev;            // best previous node
521  int8_t sign;            // sign of coeff_i
522  int16_t level;          // level
523} Node;
524
525// Score state
526typedef struct {
527  score_t score;          // partial RD score
528  const uint16_t* costs;  // shortcut to cost tables
529} ScoreState;
530
531// If a coefficient was quantized to a value Q (using a neutral bias),
532// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
533// We don't test negative values though.
534#define MIN_DELTA 0   // how much lower level to try
535#define MAX_DELTA 1   // how much higher
536#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
537#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
538#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
539
540static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
541  // TODO: incorporate the "* 256" in the tables?
542  rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
543}
544
545static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
546                                          score_t distortion) {
547  return rate * lambda + 256 * distortion;
548}
549
550static int TrellisQuantizeBlock(const VP8Encoder* const enc,
551                                int16_t in[16], int16_t out[16],
552                                int ctx0, int coeff_type,
553                                const VP8Matrix* const mtx,
554                                int lambda) {
555  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
556  const CostArray* const costs = enc->proba_.level_cost_[coeff_type];
557  const int first = (coeff_type == 0) ? 1 : 0;
558  Node nodes[16][NUM_NODES];
559  ScoreState score_states[2][NUM_NODES];
560  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
561  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
562  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
563  score_t best_score;
564  int n, m, p, last;
565
566  {
567    score_t cost;
568    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
569    const int last_proba = probas[VP8EncBands[first]][ctx0][0];
570
571    // compute the position of the last interesting coefficient
572    last = first - 1;
573    for (n = 15; n >= first; --n) {
574      const int j = kZigzag[n];
575      const int err = in[j] * in[j];
576      if (err > thresh) {
577        last = n;
578        break;
579      }
580    }
581    // we don't need to go inspect up to n = 16 coeffs. We can just go up
582    // to last + 1 (inclusive) without losing much.
583    if (last < 15) ++last;
584
585    // compute 'skip' score. This is the max score one can do.
586    cost = VP8BitCost(0, last_proba);
587    best_score = RDScoreTrellis(lambda, cost, 0);
588
589    // initialize source node.
590    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
591      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
592      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
593      ss_cur[m].costs = costs[VP8EncBands[first]][ctx0];
594    }
595  }
596
597  // traverse trellis.
598  for (n = first; n <= last; ++n) {
599    const int j = kZigzag[n];
600    const uint32_t Q  = mtx->q_[j];
601    const uint32_t iQ = mtx->iq_[j];
602    const uint32_t B = BIAS(0x00);     // neutral bias
603    // note: it's important to take sign of the _original_ coeff,
604    // so we don't have to consider level < 0 afterward.
605    const int sign = (in[j] < 0);
606    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
607    int level0 = QUANTDIV(coeff0, iQ, B);
608    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
609
610    {   // Swap current and previous score states
611      ScoreState* const tmp = ss_cur;
612      ss_cur = ss_prev;
613      ss_prev = tmp;
614    }
615
616    // test all alternate level values around level0.
617    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
618      Node* const cur = &NODE(n, m);
619      int level = level0 + m;
620      const int ctx = (level > 2) ? 2 : level;
621      const int band = VP8EncBands[n + 1];
622      score_t base_score, last_pos_score;
623      score_t best_cur_score = MAX_COST;
624      int best_prev = 0;   // default, in case
625
626      ss_cur[m].score = MAX_COST;
627      ss_cur[m].costs = costs[band][ctx];
628      if (level > MAX_LEVEL || level < 0) {   // node is dead?
629        continue;
630      }
631
632      // Compute extra rate cost if last coeff's position is < 15
633      {
634        const score_t last_pos_cost =
635            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
636        last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
637      }
638
639      {
640        // Compute delta_error = how much coding this level will
641        // subtract to max_error as distortion.
642        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
643        const int new_error = coeff0 - level * Q;
644        const int delta_error =
645            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
646        base_score = RDScoreTrellis(lambda, 0, delta_error);
647      }
648
649      // Inspect all possible non-dead predecessors. Retain only the best one.
650      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
651        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
652        // eliminated since their score can't be better than the current best.
653        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
654        // Examine node assuming it's a non-terminal one.
655        const score_t score =
656            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
657        if (score < best_cur_score) {
658          best_cur_score = score;
659          best_prev = p;
660        }
661      }
662      // Store best finding in current node.
663      cur->sign = sign;
664      cur->level = level;
665      cur->prev = best_prev;
666      ss_cur[m].score = best_cur_score;
667
668      // Now, record best terminal node (and thus best entry in the graph).
669      if (level != 0) {
670        const score_t score = best_cur_score + last_pos_score;
671        if (score < best_score) {
672          best_score = score;
673          best_path[0] = n;                     // best eob position
674          best_path[1] = m;                     // best node index
675          best_path[2] = best_prev;             // best predecessor
676        }
677      }
678    }
679  }
680
681  // Fresh start
682  memset(in + first, 0, (16 - first) * sizeof(*in));
683  memset(out + first, 0, (16 - first) * sizeof(*out));
684  if (best_path[0] == -1) {
685    return 0;   // skip!
686  }
687
688  {
689    // Unwind the best path.
690    // Note: best-prev on terminal node is not necessarily equal to the
691    // best_prev for non-terminal. So we patch best_path[2] in.
692    int nz = 0;
693    int best_node = best_path[1];
694    n = best_path[0];
695    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
696
697    for (; n >= first; --n) {
698      const Node* const node = &NODE(n, best_node);
699      const int j = kZigzag[n];
700      out[n] = node->sign ? -node->level : node->level;
701      nz |= node->level;
702      in[j] = out[n] * mtx->q_[j];
703      best_node = node->prev;
704    }
705    return (nz != 0);
706  }
707}
708
709#undef NODE
710
711//------------------------------------------------------------------------------
712// Performs: difference, transform, quantize, back-transform, add
713// all at once. Output is the reconstructed block in *yuv_out, and the
714// quantized levels in *levels.
715
716static int ReconstructIntra16(VP8EncIterator* const it,
717                              VP8ModeScore* const rd,
718                              uint8_t* const yuv_out,
719                              int mode) {
720  const VP8Encoder* const enc = it->enc_;
721  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
722  const uint8_t* const src = it->yuv_in_ + Y_OFF;
723  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
724  int nz = 0;
725  int n;
726  int16_t tmp[16][16], dc_tmp[16];
727
728  for (n = 0; n < 16; ++n) {
729    VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
730  }
731  VP8FTransformWHT(tmp[0], dc_tmp);
732  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
733
734  if (DO_TRELLIS_I16 && it->do_trellis_) {
735    int x, y;
736    VP8IteratorNzToBytes(it);
737    for (y = 0, n = 0; y < 4; ++y) {
738      for (x = 0; x < 4; ++x, ++n) {
739        const int ctx = it->top_nz_[x] + it->left_nz_[y];
740        const int non_zero =
741            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
742                                 &dqm->y1_, dqm->lambda_trellis_i16_);
743        it->top_nz_[x] = it->left_nz_[y] = non_zero;
744        rd->y_ac_levels[n][0] = 0;
745        nz |= non_zero << n;
746      }
747    }
748  } else {
749    for (n = 0; n < 16; ++n) {
750      // Zero-out the first coeff, so that: a) nz is correct below, and
751      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
752      tmp[n][0] = 0;
753      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
754      assert(rd->y_ac_levels[n][0] == 0);
755    }
756  }
757
758  // Transform back
759  VP8TransformWHT(dc_tmp, tmp[0]);
760  for (n = 0; n < 16; n += 2) {
761    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
762  }
763
764  return nz;
765}
766
767static int ReconstructIntra4(VP8EncIterator* const it,
768                             int16_t levels[16],
769                             const uint8_t* const src,
770                             uint8_t* const yuv_out,
771                             int mode) {
772  const VP8Encoder* const enc = it->enc_;
773  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
774  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
775  int nz = 0;
776  int16_t tmp[16];
777
778  VP8FTransform(src, ref, tmp);
779  if (DO_TRELLIS_I4 && it->do_trellis_) {
780    const int x = it->i4_ & 3, y = it->i4_ >> 2;
781    const int ctx = it->top_nz_[x] + it->left_nz_[y];
782    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
783                              dqm->lambda_trellis_i4_);
784  } else {
785    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
786  }
787  VP8ITransform(ref, tmp, yuv_out, 0);
788  return nz;
789}
790
791static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
792                         uint8_t* const yuv_out, int mode) {
793  const VP8Encoder* const enc = it->enc_;
794  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
795  const uint8_t* const src = it->yuv_in_ + U_OFF;
796  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
797  int nz = 0;
798  int n;
799  int16_t tmp[8][16];
800
801  for (n = 0; n < 8; ++n) {
802    VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
803  }
804  if (DO_TRELLIS_UV && it->do_trellis_) {
805    int ch, x, y;
806    for (ch = 0, n = 0; ch <= 2; ch += 2) {
807      for (y = 0; y < 2; ++y) {
808        for (x = 0; x < 2; ++x, ++n) {
809          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
810          const int non_zero =
811              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
812                                   &dqm->uv_, dqm->lambda_trellis_uv_);
813          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
814          nz |= non_zero << n;
815        }
816      }
817    }
818  } else {
819    for (n = 0; n < 8; ++n) {
820      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
821    }
822  }
823
824  for (n = 0; n < 8; n += 2) {
825    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
826  }
827  return (nz << 16);
828}
829
830//------------------------------------------------------------------------------
831// RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
832// Pick the mode is lower RD-cost = Rate + lambda * Distortion.
833
834static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
835  // We look at the first three AC coefficients to determine what is the average
836  // delta between each sub-4x4 block.
837  const int v0 = abs(DCs[1]);
838  const int v1 = abs(DCs[4]);
839  const int v2 = abs(DCs[5]);
840  int max_v = (v0 > v1) ? v1 : v0;
841  max_v = (v2 > max_v) ? v2 : max_v;
842  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
843}
844
845static void SwapPtr(uint8_t** a, uint8_t** b) {
846  uint8_t* const tmp = *a;
847  *a = *b;
848  *b = tmp;
849}
850
851static void SwapOut(VP8EncIterator* const it) {
852  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
853}
854
855static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
856  score_t score = 0;
857  while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
858    int i;
859    for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
860      score += (levels[i] != 0);
861      if (score > thresh) return 0;
862    }
863    levels += 16;
864  }
865  return 1;
866}
867
868static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
869  const int kNumBlocks = 16;
870  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
871  const int lambda = dqm->lambda_i16_;
872  const int tlambda = dqm->tlambda_;
873  const uint8_t* const src = it->yuv_in_ + Y_OFF;
874  VP8ModeScore rd16;
875  int mode;
876
877  rd->mode_i16 = -1;
878  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
879    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;  // scratch buffer
880    int nz;
881
882    // Reconstruct
883    nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);
884
885    // Measure RD-score
886    rd16.D = VP8SSE16x16(src, tmp_dst);
887    rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
888            : 0;
889    rd16.H = VP8FixedCostsI16[mode];
890    rd16.R = VP8GetCostLuma16(it, &rd16);
891    if (mode > 0 &&
892        IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
893      // penalty to avoid flat area to be mispredicted by complex mode
894      rd16.R += FLATNESS_PENALTY * kNumBlocks;
895    }
896
897    // Since we always examine Intra16 first, we can overwrite *rd directly.
898    SetRDScore(lambda, &rd16);
899    if (mode == 0 || rd16.score < rd->score) {
900      CopyScore(rd, &rd16);
901      rd->mode_i16 = mode;
902      rd->nz = nz;
903      memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
904      memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
905      SwapOut(it);
906    }
907  }
908  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
909  VP8SetIntra16Mode(it, rd->mode_i16);
910
911  // we have a blocky macroblock (only DCs are non-zero) with fairly high
912  // distortion, record max delta so we can later adjust the minimal filtering
913  // strength needed to smooth these blocks out.
914  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
915    StoreMaxDelta(dqm, rd->y_dc_levels);
916  }
917}
918
919//------------------------------------------------------------------------------
920
921// return the cost array corresponding to the surrounding prediction modes.
922static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
923                                     const uint8_t modes[16]) {
924  const int preds_w = it->enc_->preds_w_;
925  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
926  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
927  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
928  return VP8FixedCostsI4[top][left];
929}
930
931static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
932  const VP8Encoder* const enc = it->enc_;
933  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
934  const int lambda = dqm->lambda_i4_;
935  const int tlambda = dqm->tlambda_;
936  const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
937  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
938  int total_header_bits = 0;
939  VP8ModeScore rd_best;
940
941  if (enc->max_i4_header_bits_ == 0) {
942    return 0;
943  }
944
945  InitScore(&rd_best);
946  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
947  SetRDScore(dqm->lambda_mode_, &rd_best);
948  VP8IteratorStartI4(it);
949  do {
950    const int kNumBlocks = 1;
951    VP8ModeScore rd_i4;
952    int mode;
953    int best_mode = -1;
954    const uint8_t* const src = src0 + VP8Scan[it->i4_];
955    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
956    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
957    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.
958
959    InitScore(&rd_i4);
960    VP8MakeIntra4Preds(it);
961    for (mode = 0; mode < NUM_BMODES; ++mode) {
962      VP8ModeScore rd_tmp;
963      int16_t tmp_levels[16];
964
965      // Reconstruct
966      rd_tmp.nz =
967          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
968
969      // Compute RD-score
970      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
971      rd_tmp.SD =
972          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
973                  : 0;
974      rd_tmp.H = mode_costs[mode];
975      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
976      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
977        rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
978      }
979
980      SetRDScore(lambda, &rd_tmp);
981      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
982        CopyScore(&rd_i4, &rd_tmp);
983        best_mode = mode;
984        SwapPtr(&tmp_dst, &best_block);
985        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
986      }
987    }
988    SetRDScore(dqm->lambda_mode_, &rd_i4);
989    AddScore(&rd_best, &rd_i4);
990    if (rd_best.score >= rd->score) {
991      return 0;
992    }
993    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
994    if (total_header_bits > enc->max_i4_header_bits_) {
995      return 0;
996    }
997    // Copy selected samples if not in the right place already.
998    if (best_block != best_blocks + VP8Scan[it->i4_]) {
999      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
1000    }
1001    rd->modes_i4[it->i4_] = best_mode;
1002    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
1003  } while (VP8IteratorRotateI4(it, best_blocks));
1004
1005  // finalize state
1006  CopyScore(rd, &rd_best);
1007  VP8SetIntra4Mode(it, rd->modes_i4);
1008  SwapOut(it);
1009  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
1010  return 1;   // select intra4x4 over intra16x16
1011}
1012
1013//------------------------------------------------------------------------------
1014
1015static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
1016  const int kNumBlocks = 8;
1017  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1018  const int lambda = dqm->lambda_uv_;
1019  const uint8_t* const src = it->yuv_in_ + U_OFF;
1020  uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF;  // scratch buffer
1021  uint8_t* const dst0 = it->yuv_out_ + U_OFF;
1022  VP8ModeScore rd_best;
1023  int mode;
1024
1025  rd->mode_uv = -1;
1026  InitScore(&rd_best);
1027  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1028    VP8ModeScore rd_uv;
1029
1030    // Reconstruct
1031    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
1032
1033    // Compute RD-score
1034    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
1035    rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
1036    rd_uv.H  = VP8FixedCostsUV[mode];
1037    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
1038    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
1039      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
1040    }
1041
1042    SetRDScore(lambda, &rd_uv);
1043    if (mode == 0 || rd_uv.score < rd_best.score) {
1044      CopyScore(&rd_best, &rd_uv);
1045      rd->mode_uv = mode;
1046      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
1047      memcpy(dst0, tmp_dst, UV_SIZE);   //  TODO: SwapUVOut() ?
1048    }
1049  }
1050  VP8SetIntraUVMode(it, rd->mode_uv);
1051  AddScore(rd, &rd_best);
1052}
1053
1054//------------------------------------------------------------------------------
1055// Final reconstruction and quantization.
1056
1057static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
1058  const VP8Encoder* const enc = it->enc_;
1059  const int is_i16 = (it->mb_->type_ == 1);
1060  int nz = 0;
1061
1062  if (is_i16) {
1063    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
1064  } else {
1065    VP8IteratorStartI4(it);
1066    do {
1067      const int mode =
1068          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
1069      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
1070      uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
1071      VP8MakeIntra4Preds(it);
1072      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1073                              src, dst, mode) << it->i4_;
1074    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
1075  }
1076
1077  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
1078  rd->nz = nz;
1079}
1080
1081// Refine intra16/intra4 sub-modes based on distortion only (not rate).
1082static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
1083  const int is_i16 = (it->mb_->type_ == 1);
1084  score_t best_score = MAX_COST;
1085
1086  if (try_both_i4_i16 || is_i16) {
1087    int mode;
1088    int best_mode = -1;
1089    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1090      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1091      const uint8_t* const src = it->yuv_in_ + Y_OFF;
1092      const score_t score = VP8SSE16x16(src, ref);
1093      if (score < best_score) {
1094        best_mode = mode;
1095        best_score = score;
1096      }
1097    }
1098    VP8SetIntra16Mode(it, best_mode);
1099  }
1100  if (try_both_i4_i16 || !is_i16) {
1101    uint8_t modes_i4[16];
1102    // We don't evaluate the rate here, but just account for it through a
1103    // constant penalty (i4 mode usually needs more bits compared to i16).
1104    score_t score_i4 = (score_t)I4_PENALTY;
1105
1106    VP8IteratorStartI4(it);
1107    do {
1108      int mode;
1109      int best_sub_mode = -1;
1110      score_t best_sub_score = MAX_COST;
1111      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
1112
1113      // TODO(skal): we don't really need the prediction pixels here,
1114      // but just the distortion against 'src'.
1115      VP8MakeIntra4Preds(it);
1116      for (mode = 0; mode < NUM_BMODES; ++mode) {
1117        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1118        const score_t score = VP8SSE4x4(src, ref);
1119        if (score < best_sub_score) {
1120          best_sub_mode = mode;
1121          best_sub_score = score;
1122        }
1123      }
1124      modes_i4[it->i4_] = best_sub_mode;
1125      score_i4 += best_sub_score;
1126      if (score_i4 >= best_score) break;
1127    } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
1128    if (score_i4 < best_score) {
1129      VP8SetIntra4Mode(it, modes_i4);
1130    }
1131  }
1132}
1133
1134//------------------------------------------------------------------------------
1135// Entry point
1136
1137int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1138                VP8RDLevel rd_opt) {
1139  int is_skipped;
1140  const int method = it->enc_->method_;
1141
1142  InitScore(rd);
1143
1144  // We can perform predictions for Luma16x16 and Chroma8x8 already.
1145  // Luma4x4 predictions needs to be done as-we-go.
1146  VP8MakeLuma16Preds(it);
1147  VP8MakeChroma8Preds(it);
1148
1149  if (rd_opt > RD_OPT_NONE) {
1150    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1151    PickBestIntra16(it, rd);
1152    if (method >= 2) {
1153      PickBestIntra4(it, rd);
1154    }
1155    PickBestUV(it, rd);
1156    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
1157      it->do_trellis_ = 1;
1158      SimpleQuantize(it, rd);
1159    }
1160  } else {
1161    // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
1162    // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
1163    DistoRefine(it, (method >= 2));
1164    SimpleQuantize(it, rd);
1165  }
1166  is_skipped = (rd->nz == 0);
1167  VP8SetSkip(it, is_skipped);
1168  return is_skipped;
1169}
1170
1171