1// Copyright 2011 Google Inc. All Rights Reserved.
2//
3// Use of this source code is governed by a BSD-style license
4// that can be found in the COPYING file in the root of the source
5// tree. An additional intellectual property rights grant can be found
6// in the file PATENTS. All contributing project authors may
7// be found in the AUTHORS file in the root of the source tree.
8// -----------------------------------------------------------------------------
9//
10//   Quantization
11//
12// Author: Skal (pascal.massimino@gmail.com)
13
14#include <assert.h>
15#include <math.h>
16
17#include "./vp8enci.h"
18#include "./cost.h"
19
20#define DO_TRELLIS_I4  1
21#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
22#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
23#define USE_TDISTO 1
24
25#define MID_ALPHA 64      // neutral value for susceptibility
26#define MIN_ALPHA 30      // lowest usable value for susceptibility
#define MAX_ALPHA 100     // highest meaningful value for susceptibility
28
29#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
30                          // power-law modulation. Must be strictly less than 1.
31
32#define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision
33
34#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
35
36#if defined(__cplusplus) || defined(c_plusplus)
37extern "C" {
38#endif
39
40//------------------------------------------------------------------------------
41
42static WEBP_INLINE int clip(int v, int m, int M) {
43  return v < m ? m : v > M ? M : v;
44}
45
// Zigzag scan order: maps a coefficient's scan position (0..15) to its
// raster index within the 4x4 block.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

// DC quantization steps, indexed by the clipped quantizer index (0..127).
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

// AC quantization steps, indexed by the clipped quantizer index (0..127).
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

// AC quantization steps used for the Y2 (luma-DC) block, see SetupMatrices().
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

// Per-position coefficient thresholds. Currently only referenced by the
// commented-out experiment in ExpandMatrix() (see the TODO there).
static const uint16_t kCoeffThresh[16] = {
  0,  10, 20, 30,
  10, 20, 30, 30,
  20, 30, 30, 30,
  30, 30, 30, 30
};

// TODO(skal): tune more. Coeff thresholding?
// Note: all entries are currently 96, i.e. a uniform quantization bias.
static const uint8_t kBiasMatrices[3][16] = {  // [3] = [luma-ac,luma-dc,chroma]
  { 96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96 },
  { 96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96 },
  { 96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs (only for trellis).
// Hack-ish but helpful for mid-bitrate range. Use with care.
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
138
139//------------------------------------------------------------------------------
140// Initialize quantization parameters in VP8Matrix
141
142// Returns the average quantizer
143static int ExpandMatrix(VP8Matrix* const m, int type) {
144  int i;
145  int sum = 0;
146  for (i = 2; i < 16; ++i) {
147    m->q_[i] = m->q_[1];
148  }
149  for (i = 0; i < 16; ++i) {
150    const int j = kZigzag[i];
151    const int bias = kBiasMatrices[type][j];
152    m->iq_[j] = (1 << QFIX) / m->q_[j];
153    m->bias_[j] = BIAS(bias);
154    // TODO(skal): tune kCoeffThresh[]
155    m->zthresh_[j] = ((256 /*+ kCoeffThresh[j]*/ - bias) * m->q_[j] + 127) >> 8;
156    m->sharpen_[j] = (kFreqSharpening[j] * m->q_[j]) >> 11;
157    sum += m->q_[j];
158  }
159  return (sum + 8) >> 4;
160}
161
162static void SetupMatrices(VP8Encoder* enc) {
163  int i;
164  const int tlambda_scale =
165    (enc->method_ >= 4) ? enc->config_->sns_strength
166                        : 0;
167  const int num_segments = enc->segment_hdr_.num_segments_;
168  for (i = 0; i < num_segments; ++i) {
169    VP8SegmentInfo* const m = &enc->dqm_[i];
170    const int q = m->quant_;
171    int q4, q16, quv;
172    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
173    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];
174
175    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
176    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
177
178    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
179    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
180
181    q4  = ExpandMatrix(&m->y1_, 0);
182    q16 = ExpandMatrix(&m->y2_, 1);
183    quv = ExpandMatrix(&m->uv_, 2);
184
185    // TODO: Switch to kLambda*[] tables?
186    {
187      m->lambda_i4_  = (3 * q4 * q4) >> 7;
188      m->lambda_i16_ = (3 * q16 * q16);
189      m->lambda_uv_  = (3 * quv * quv) >> 6;
190      m->lambda_mode_    = (1 * q4 * q4) >> 7;
191      m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
192      m->lambda_trellis_i16_ = (q16 * q16) >> 2;
193      m->lambda_trellis_uv_  = (quv *quv) << 1;
194      m->tlambda_            = (tlambda_scale * q4) >> 5;
195    }
196  }
197}
198
199//------------------------------------------------------------------------------
200// Initialize filtering parameters
201
202// Very small filter-strength values have close to no visual effect. So we can
203// save a little decoding-CPU by turning filtering off for these.
204#define FSTRENGTH_CUTOFF 3
205
206static void SetupFilterStrength(VP8Encoder* const enc) {
207  int i;
208  const int level0 = enc->config_->filter_strength;
209  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
210    // Segments with lower quantizer will be less filtered. TODO: tune (wrt SNS)
211    const int level = level0 * 256 * enc->dqm_[i].quant_ / 128;
212    const int f = level / (256 + enc->dqm_[i].beta_);
213    enc->dqm_[i].fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
214  }
215  // We record the initial strength (mainly for the case of 1-segment only).
216  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
217  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
218  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
219}
220
221//------------------------------------------------------------------------------
222
223// Note: if you change the values below, remember that the max range
224// allowed by the syntax for DQ_UV is [-16,16].
225#define MAX_DQ_UV (6)
226#define MIN_DQ_UV (-4)
227
// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a linear piece-wise function.
231static double QualityToCompression(double c) {
232  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
233  // The file size roughly scales as pow(quantizer, 3.). Actually, the
234  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
235  // in the mid-quant range. So we scale the compressibility inversely to
236  // this power-law: quant ~= compression ^ 1/3. This law holds well for
237  // low quant. Finer modelling for high-quant would make use of kAcTable[]
238  // more explicitly.
239  const double v = pow(linear_c, 1 / 3.);
240  return v;
241}
242
243static double QualityToJPEGCompression(double c, double alpha) {
244  // We map the complexity 'alpha' and quality setting 'c' to a compression
245  // exponent empirically matched to the compression curve of libjpeg6b.
246  // On average, the WebP output size will be roughly similar to that of a
247  // JPEG file compressed with same quality factor.
248  const double amin = 0.30;
249  const double amax = 0.85;
250  const double exp_min = 0.4;
251  const double exp_max = 0.9;
252  const double slope = (exp_min - exp_max) / (amax - amin);
253  // Linearly interpolate 'expn' from exp_min to exp_max
254  // in the [amin, amax] range.
255  const double expn = (alpha > amax) ? exp_min
256                    : (alpha < amin) ? exp_max
257                    : exp_max + slope * (alpha - amin);
258  const double v = pow(c, expn);
259  return v;
260}
261
262static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
263                                 const VP8SegmentInfo* const S2) {
264  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
265}
266
267static void SimplifySegments(VP8Encoder* const enc) {
268  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
269  const int num_segments = enc->segment_hdr_.num_segments_;
270  int num_final_segments = 1;
271  int s1, s2;
272  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
273    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
274    int found = 0;
275    // check if we already have similar segment
276    for (s2 = 0; s2 < num_final_segments; ++s2) {
277      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
278      if (SegmentsAreEquivalent(S1, S2)) {
279        found = 1;
280        break;
281      }
282    }
283    map[s1] = s2;
284    if (!found) {
285      if (num_final_segments != s1) {
286        enc->dqm_[num_final_segments] = enc->dqm_[s1];
287      }
288      ++num_final_segments;
289    }
290  }
291  if (num_final_segments < num_segments) {  // Remap
292    int i = enc->mb_w_ * enc->mb_h_;
293    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
294    enc->segment_hdr_.num_segments_ = num_final_segments;
295    // Replicate the trailing segment infos (it's mostly cosmetics)
296    for (i = num_final_segments; i < num_segments; ++i) {
297      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
298    }
299  }
300}
301
// Computes the per-segment quantizer from the target 'quality' (0..100),
// modulated by each segment's susceptibility (alpha_), then derives the
// UV quantizer deltas and finalizes the filter strengths and quantization
// matrices for the whole encoder.
void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  // Base compression factor, optionally matched to libjpeg's size curve.
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));   // quantizer index in [0, 127]
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite more reactive to high quants (flat DC-blocks
  // tend to appear, and are displeasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, eventually

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}
356
357//------------------------------------------------------------------------------
358// Form the predictions in cache
359
// Offsets of the pre-computed prediction samples within the 'yuv_p_'
// scratch buffer, one entry per prediction mode.
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const int VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};
368
369void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
370  const VP8Encoder* const enc = it->enc_;
371  const uint8_t* const left = it->x_ ? enc->y_left_ : NULL;
372  const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
373  VP8EncPredLuma16(it->yuv_p_, left, top);
374}
375
376void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
377  const VP8Encoder* const enc = it->enc_;
378  const uint8_t* const left = it->x_ ? enc->u_left_ : NULL;
379  const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
380  VP8EncPredChroma8(it->yuv_p_, left, top);
381}
382
// Computes the intra4 predictions for the current 4x4 sub-block, using the
// iterator's cached top-right samples (i4_top_).
void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}
386
387//------------------------------------------------------------------------------
388// Quantize
389
// Layout:
// +----+
// |YYYY| 0
// |YYYY| 4
// |YYYY| 8
// |YYYY| 12
// +----+
// |UUVV| 16
// |UUVV| 20
// +----+

// Sample offset of each 4x4 sub-block within the macroblock buffers,
// following the layout above (BPS is the buffer stride).
const int VP8Scan[16 + 4 + 4] = {
  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,

  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};
411
412//------------------------------------------------------------------------------
413// Distortion measurement
414
// Per-coefficient weights used by the VP8TDisto* texture-distortion metric.
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

// Per-coefficient distortion weights used by the trellis quantizer
// (uniform when USE_TDISTO is disabled).
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};
429
430// Init/Copy the common fields in score.
431static void InitScore(VP8ModeScore* const rd) {
432  rd->D  = 0;
433  rd->SD = 0;
434  rd->R  = 0;
435  rd->nz = 0;
436  rd->score = MAX_COST;
437}
438
439static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
440  dst->D  = src->D;
441  dst->SD = src->SD;
442  dst->R  = src->R;
443  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
444  dst->score = src->score;
445}
446
447static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
448  dst->D  += src->D;
449  dst->SD += src->SD;
450  dst->R  += src->R;
451  dst->nz |= src->nz;     // here, new nz bits are accumulated.
452  dst->score += src->score;
453}
454
455//------------------------------------------------------------------------------
456// Performs trellis-optimized quantization.
457
458// Trellis
459
// One trellis state: a candidate quantization level for one coefficient.
typedef struct {
  int prev;        // best previous node (its 'm' delta index)
  int level;       // candidate quantized level (always >= 0)
  int sign;        // sign of the original coeff_i (levels are stored unsigned)
  score_t cost;    // bit cost of the best path ending at this node
  score_t error;   // distortion = sum of (|coeff_i| - level_i * Q_i)^2
  int ctx;         // context (only depends on 'level'. Could be spared.)
} Node;
468
469// If a coefficient was quantized to a value Q (using a neutral bias),
470// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
471// We don't test negative values though.
472#define MIN_DELTA 0   // how much lower level to try
473#define MAX_DELTA 1   // how much higher
474#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
475#define NODE(n, l) (nodes[(n) + 1][(l) + MIN_DELTA])
476
477static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
478  // TODO: incorporate the "* 256" in the tables?
479  rd->score = rd->R * lambda + 256 * (rd->D + rd->SD);
480}
481
482static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
483                                          score_t distortion) {
484  return rate * lambda + 256 * distortion;
485}
486
// Quantizes the coefficients 'in' (natural/raster order) with a trellis
// (Viterbi-like) search over candidate levels, trading rate against
// distortion with multiplier 'lambda'.
// 'ctx0' is the entry coding context; 'coeff_type' selects the probability
// and cost tables (0 = i16 luma-AC, which skips the DC coefficient; 2 and 3
// are used by the chroma and i4 paths respectively — see callers).
// On return, 'out' holds the signed levels in scan (zigzag) order and 'in'
// is overwritten with the corresponding dequantized values.
// Returns non-zero if at least one level is non-zero.
static int TrellisQuantizeBlock(const VP8EncIterator* const it,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  ProbaArray* const last_costs = it->enc_->proba_.coeffs_[coeff_type];
  CostArray* const costs = it->enc_->proba_.level_cost_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;   // luma-AC skips coeff #0
  Node nodes[17][NUM_NODES];   // accessed via NODE(), with a +1 row offset
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int best_node;
  int last = first - 1;
  int n, m, p, nz;

  {
    score_t cost;
    score_t max_error;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = last_costs[VP8EncBands[first]][ctx0][0];

    // compute maximal distortion.
    max_error = 0;
    for (n = first; n < 16; ++n) {
      const int j  = kZigzag[n];
      const int err = in[j] * in[j];
      max_error += kWeightTrellis[j] * err;
      if (err > thresh) last = n;
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, max_error);

    // initialize source node.
    n = first - 1;
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      NODE(n, m).cost = 0;
      NODE(n, m).error = max_error;
      NODE(n, m).ctx = ctx0;
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j  = kZigzag[n];
    const int Q  = mtx->q_[j];
    const int iQ = mtx->iq_[j];
    const int B = BIAS(0x00);     // neutral bias
    // note: it's important to take sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0;
    if (coeff0 > 2047) coeff0 = 2047;

    level0 = QUANTDIV(coeff0, iQ, B);
    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int delta_error, new_error;
      score_t cur_score = MAX_COST;
      int level = level0 + m;
      int last_proba;

      cur->sign = sign;
      cur->level = level;
      cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
      if (level >= 2048 || level < 0) {   // node is dead?
        cur->cost = MAX_COST;
        continue;
      }
      last_proba = last_costs[VP8EncBands[n + 1]][cur->ctx][0];

      // Compute delta_error = how much coding this level will
      // subtract as distortion to max_error
      new_error = coeff0 - level * Q;
      delta_error =
        kWeightTrellis[j] * (coeff0 * coeff0 - new_error * new_error);

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        const Node* const prev = &NODE(n - 1, p);
        const int prev_ctx = prev->ctx;
        const uint16_t* const tcost = costs[VP8EncBands[n]][prev_ctx];
        const score_t total_error = prev->error - delta_error;
        score_t cost, base_cost, score;

        if (prev->cost >= MAX_COST) {   // dead node?
          continue;
        }

        // Base cost of both terminal/non-terminal
        base_cost = prev->cost + VP8LevelCost(tcost, level);

        // Examine node assuming it's a non-terminal one.
        cost = base_cost;
        if (level && n < 15) {
          cost += VP8BitCost(1, last_proba);
        }
        score = RDScoreTrellis(lambda, cost, total_error);
        if (score < cur_score) {
          cur_score = score;
          cur->cost  = cost;
          cur->error = total_error;
          cur->prev  = p;
        }

        // Now, record best terminal node (and thus best entry in the graph).
        if (level) {
          cost = base_cost;
          if (n < 15) cost += VP8BitCost(0, last_proba);
          score = RDScoreTrellis(lambda, cost, total_error);
          if (score < best_score) {
            best_score = score;
            best_path[0] = n;   // best eob position
            best_path[1] = m;   // best level
            best_path[2] = p;   // best predecessor
          }
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  // Unwind the best path.
  // Note: best-prev on terminal node is not necessarily equal to the
  // best_prev for non-terminal. So we patch best_path[2] in.
  n = best_path[0];
  best_node = best_path[1];
  NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
  nz = 0;

  // Walk backward from the end-of-block, emitting levels and rebuilding the
  // dequantized coefficients into 'in'.
  for (; n >= first; --n) {
    const Node* const node = &NODE(n, best_node);
    const int j = kZigzag[n];
    out[n] = node->sign ? -node->level : node->level;
    nz |= (node->level != 0);
    in[j] = out[n] * mtx->q_[j];
    best_node = node->prev;
  }
  return nz;
}
639
640#undef NODE
641
642//------------------------------------------------------------------------------
643// Performs: difference, transform, quantize, back-transform, add
644// all at once. Output is the reconstructed block in *yuv_out, and the
645// quantized levels in *levels.
646
// Performs difference + forward transform + quantization + inverse transform
// + reconstruction for the whole macroblock, against the intra16 prediction
// 'mode'. Quantized levels go into 'rd' (AC in y_ac_levels, the transformed
// DC block in y_dc_levels); reconstructed samples go into 'yuv_out'.
// Returns the non-zero bitmap: bit n for luma sub-block n (0..15),
// bit 24 for the Y2 (DC) block.
static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  // residual + forward transform for each of the 16 4x4 sub-blocks
  for (n = 0; n < 16; ++n) {
    VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  // the 16 DC coefficients are gathered into a separate Walsh-Hadamard
  // transformed block, quantized with the y2_ matrix.
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlock(dc_tmp, rd->y_dc_levels, 0, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);   // refresh top/left non-zero contexts
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
           TrellisQuantizeBlock(it, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; ++n) {
      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], 1, &dqm->y1_) << n;
    }
  }

  // Transform back
  VP8ITransformWHT(dc_tmp, tmp[0]);
  // presumably the last argument makes VP8ITransform process two adjacent
  // blocks at once — matches the n += 2 stride. TODO(review): confirm.
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}
692
693static int ReconstructIntra4(VP8EncIterator* const it,
694                             int16_t levels[16],
695                             const uint8_t* const src,
696                             uint8_t* const yuv_out,
697                             int mode) {
698  const VP8Encoder* const enc = it->enc_;
699  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
700  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
701  int nz = 0;
702  int16_t tmp[16];
703
704  VP8FTransform(src, ref, tmp);
705  if (DO_TRELLIS_I4 && it->do_trellis_) {
706    const int x = it->i4_ & 3, y = it->i4_ >> 2;
707    const int ctx = it->top_nz_[x] + it->left_nz_[y];
708    nz = TrellisQuantizeBlock(it, tmp, levels, ctx, 3, &dqm->y1_,
709                              dqm->lambda_trellis_i4_);
710  } else {
711    nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_);
712  }
713  VP8ITransform(ref, tmp, yuv_out, 0);
714  return nz;
715}
716
// Same scheme as ReconstructIntra16, applied to the 8 chroma 4x4 blocks
// (4 U followed by 4 V) against the chroma prediction 'mode'.
// Returns the nz bitmap shifted left by 16, so it can be merged with the
// luma bits.
static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  // residual + forward transform for the 8 chroma sub-blocks
  for (n = 0; n < 8; ++n) {
    VP8FTransform(src + VP8Scan[16 + n], ref + VP8Scan[16 + n], tmp[n]);
  }
  if (DO_TRELLIS_UV && it->do_trellis_) {   // note: disabled by default
    int ch, x, y;
    // 'ch' is 0 for the U channel, 2 for V (matches the nz context layout).
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
            TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2,
                                 &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; ++n) {
      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], 0, &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n], 1);
  }
  return (nz << 16);
}
755
756//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.
759
760static void SwapPtr(uint8_t** a, uint8_t** b) {
761  uint8_t* const tmp = *a;
762  *a = *b;
763  *b = tmp;
764}
765
766static void SwapOut(VP8EncIterator* const it) {
767  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
768}
769
// Evaluates all intra16 luma prediction modes, reconstructing each into the
// scratch buffer, and stores the best (lowest RD-score) mode, levels and nz
// bits into *rd. Buffers are swapped on each improvement, so the winning
// reconstruction always ends up as the current output (it->yuv_out_).
static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
  VP8ModeScore rd16;
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;  // scratch buffer
    int nz;

    // Reconstruct
    nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);

    // Measure RD-score: distortion, texture distortion, and rate.
    rd16.D = VP8SSE16x16(src, tmp_dst);
    rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
            : 0;
    rd16.R = VP8GetCostLuma16(it, &rd16);
    rd16.R += VP8FixedCostsI16[mode];   // mode-signalling cost

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, &rd16);
    if (mode == 0 || rd16.score < rd->score) {
      CopyScore(rd, &rd16);
      rd->mode_i16 = mode;
      rd->nz = nz;
      memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
      memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
      SwapOut(it);
    }
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);
}
808
809//------------------------------------------------------------------------------
810
811// return the cost array corresponding to the surrounding prediction modes.
812static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
813                                     const uint8_t modes[16]) {
814  const int preds_w = it->enc_->preds_w_;
815  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
816  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
817  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
818  return VP8FixedCostsI4[top][left];
819}
820
// Tries the intra4 prediction modes for each of the 16 sub-blocks,
// accumulating the best choice per block. Bails out early (returning 0,
// i.e. "keep intra16") as soon as the accumulated score reaches the
// intra16 score stored in *rd, or the header-bits budget is exhausted.
// Returns 1 if intra4x4 should be selected over intra16x16; *rd is then
// updated with the intra4 scores, modes and levels.
static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {   // i4 disabled by configuration
    return 0;
  }

  InitScore(&rd_best);
  rd_best.score = 211;  // '211' is the value of VP8BitCost(0, 145)
  VP8IteratorStartI4(it);
  do {
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
      rd_tmp.R += mode_costs[mode];

      SetRDScore(lambda, &rd_tmp);
      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // keep the best reconstruction by swapping scratch/destination
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    total_header_bits += mode_costs[best_mode];
    if (rd_best.score >= rd->score ||
        total_header_bits > enc->max_i4_header_bits_) {
      return 0;   // intra16 stays cheaper: abort the i4 exploration
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_])
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
894
895//------------------------------------------------------------------------------
896
897static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
898  const VP8Encoder* const enc = it->enc_;
899  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
900  const int lambda = dqm->lambda_uv_;
901  const uint8_t* const src = it->yuv_in_ + U_OFF;
902  uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF;  // scratch buffer
903  uint8_t* const dst0 = it->yuv_out_ + U_OFF;
904  VP8ModeScore rd_best;
905  int mode;
906
907  rd->mode_uv = -1;
908  InitScore(&rd_best);
909  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
910    VP8ModeScore rd_uv;
911
912    // Reconstruct
913    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
914
915    // Compute RD-score
916    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
917    rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
918    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
919    rd_uv.R += VP8FixedCostsUV[mode];
920
921    SetRDScore(lambda, &rd_uv);
922    if (mode == 0 || rd_uv.score < rd_best.score) {
923      CopyScore(&rd_best, &rd_uv);
924      rd->mode_uv = mode;
925      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
926      memcpy(dst0, tmp_dst, UV_SIZE);   //  TODO: SwapUVOut() ?
927    }
928  }
929  VP8SetIntraUVMode(it, rd->mode_uv);
930  AddScore(rd, &rd_best);
931}
932
933//------------------------------------------------------------------------------
934// Final reconstruction and quantization.
935
936static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
937  const VP8Encoder* const enc = it->enc_;
938  const int is_i16 = (it->mb_->type_ == 1);
939  int nz = 0;
940
941  if (is_i16) {
942    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
943  } else {
944    VP8IteratorStartI4(it);
945    do {
946      const int mode =
947          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
948      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
949      uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
950      VP8MakeIntra4Preds(it);
951      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
952                              src, dst, mode) << it->i4_;
953    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
954  }
955
956  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
957  rd->nz = nz;
958}
959
960// Refine intra16/intra4 sub-modes based on distortion only (not rate).
961static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
962  const int is_i16 = (it->mb_->type_ == 1);
963  score_t best_score = MAX_COST;
964
965  if (try_both_i4_i16 || is_i16) {
966    int mode;
967    int best_mode = -1;
968    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
969      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
970      const uint8_t* const src = it->yuv_in_ + Y_OFF;
971      const score_t score = VP8SSE16x16(src, ref);
972      if (score < best_score) {
973        best_mode = mode;
974        best_score = score;
975      }
976    }
977    VP8SetIntra16Mode(it, best_mode);
978  }
979  if (try_both_i4_i16 || !is_i16) {
980    uint8_t modes_i4[16];
981    // We don't evaluate the rate here, but just account for it through a
982    // constant penalty (i4 mode usually needs more bits compared to i16).
983    score_t score_i4 = (score_t)I4_PENALTY;
984
985    VP8IteratorStartI4(it);
986    do {
987      int mode;
988      int best_sub_mode = -1;
989      score_t best_sub_score = MAX_COST;
990      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
991
992      // TODO(skal): we don't really need the prediction pixels here,
993      // but just the distortion against 'src'.
994      VP8MakeIntra4Preds(it);
995      for (mode = 0; mode < NUM_BMODES; ++mode) {
996        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
997        const score_t score = VP8SSE4x4(src, ref);
998        if (score < best_sub_score) {
999          best_sub_mode = mode;
1000          best_sub_score = score;
1001        }
1002      }
1003      modes_i4[it->i4_] = best_sub_mode;
1004      score_i4 += best_sub_score;
1005      if (score_i4 >= best_score) break;
1006    } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
1007    if (score_i4 < best_score) {
1008      VP8SetIntra4Mode(it, modes_i4);
1009    }
1010  }
1011}
1012
1013//------------------------------------------------------------------------------
1014// Entry point
1015
1016int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1017                VP8RDLevel rd_opt) {
1018  int is_skipped;
1019  const int method = it->enc_->method_;
1020
1021  InitScore(rd);
1022
1023  // We can perform predictions for Luma16x16 and Chroma8x8 already.
1024  // Luma4x4 predictions needs to be done as-we-go.
1025  VP8MakeLuma16Preds(it);
1026  VP8MakeChroma8Preds(it);
1027
1028  if (rd_opt > RD_OPT_NONE) {
1029    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1030    PickBestIntra16(it, rd);
1031    if (method >= 2) {
1032      PickBestIntra4(it, rd);
1033    }
1034    PickBestUV(it, rd);
1035    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
1036      it->do_trellis_ = 1;
1037      SimpleQuantize(it, rd);
1038    }
1039  } else {
1040    // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
1041    // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
1042    DistoRefine(it, (method >= 2));
1043    SimpleQuantize(it, rd);
1044  }
1045  is_skipped = (rd->nz == 0);
1046  VP8SetSkip(it, is_skipped);
1047  return is_skipped;
1048}
1049
1050#if defined(__cplusplus) || defined(c_plusplus)
1051}    // extern "C"
1052#endif
1053