quant_enc.c revision fa39824bb690c5806358871f46940d0450973d8a
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal (pascal.massimino@gmail.com)

#include <assert.h>
#include <math.h>
#include <stdlib.h>  // for abs()

#include "./vp8i_enc.h"
#include "./cost_enc.h"

#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64   // neutral value for susceptibility
#define MIN_ALPHA 30   // lowest usable value for susceptibility
#define MAX_ALPHA 100  // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9  // Scaling constant between the sns value and the QP
                       // power-law modulation. Must be strictly less than 1.

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 10  // I16 mode
#define FLATNESS_LIMIT_I4  3   // I4 mode
#define FLATNESS_LIMIT_UV  2   // UV mode
#define FLATNESS_PENALTY   140  // roughly ~1bit per block

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)

#define RD_DISTO_MULT 256  // distortion multiplier (equivalent of lambda)

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");  // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif  // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static const uint8_t kDcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 10,
  11, 12, 13, 14, 15, 16, 17, 17,
  18, 19, 20, 20, 21, 21, 22, 22,
  23, 23, 24, 25, 25, 26, 27, 28,
  29, 30, 31, 32, 33, 34, 35, 36,
  37, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 46, 47, 48, 49, 50,
  51, 52, 53, 54, 55, 56, 57, 58,
  59, 60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 72, 73, 74,
  75, 76, 76, 77, 78, 79, 80, 81,
  82, 83, 84, 85, 86, 87, 88, 89,
  91, 93, 95, 96, 98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 11,
  12, 13, 14, 15, 16, 17, 18, 19,
  20, 21, 22, 23, 24, 25, 26, 27,
  28, 29, 30, 31, 32, 33, 34, 35,
  36, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 47, 48, 49, 50, 51,
  52, 53, 54, 55, 56, 57, 58, 60,
  62, 64, 66, 68, 70, 72, 74, 76,
  78, 80, 82, 84, 86, 88, 90, 92,
  94, 96, 98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8, 8, 9, 10, 12, 13, 15, 17,
  18, 20, 21, 23, 24, 26, 27, 29,
  31, 32, 34, 35, 37, 38, 40, 41,
  43, 44, 46, 48, 49, 51, 52, 54,
  55, 57, 58, 60, 62, 63, 65, 66,
  68, 69, 71, 72, 74, 75, 77, 79,
  80, 82, 83, 85, 86, 88, 89, 93,
  96, 99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
  }
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {  // we only use sharpening for AC luma coeffs
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen_[i] = 0;
    }
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}

static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
      (enc->method_ >= 4) ? enc->config_->sns_strength
                          : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q_i4  = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv  = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_         = (3 * q_i16 * q_i16);
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];  // quantization-aware min disto
    m->max_edge_  = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a piece-wise linear function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {  // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are much more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
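  // As a worked example: with uv_alpha_ = 100 and sns_strength = 100, the
  // mapping above gives dq_uv_ac = (100 - 64) * 10 / 70 = 5, while the DC
  // boost below gives dq_uv_dc = -4.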
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);  // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;  // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);  // initialize segments' filtering

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);  // finalize quantization matrices
}

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const int VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

const int VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS,  8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS,  8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS,  8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS,  8 + 12 * BPS, 12 + 12 * BPS,
};

static const int VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,  4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,   // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS    // V
};

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D = 0;
  rd->SD = 0;
  rd->R = 0;
  rd->H = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D = src->D;
  dst->SD = src->SD;
  dst->R = src->R;
  dst->H = src->H;
  dst->nz = src->nz;  // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D += src->D;
  dst->SD += src->SD;
  dst->R += src->R;
  dst->H += src->H;
  dst->nz |= src->nz;  // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Trellis node
typedef struct {
  int8_t prev;    // best previous node
  int8_t sign;    // sign of coeff_i
  int16_t level;  // level
} Node;

// Score state
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
// We don't test negative values though.
#define MIN_DELTA 0  // how much lower level to try
#define MAX_DELTA 1  // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}

static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};  // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);  // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {  // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score = MAX_COST;
      int best_prev = 0;  // default, in case.

      ss_cur[m].score = MAX_COST;
      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract from max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        const score_t score =
            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      // Store the best finding in the current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        const score_t score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;          // best eob position
          best_path[1] = m;          // best node index
          best_path[2] = best_prev;  // best predecessor
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;  // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];  // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
                                   &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine the average
  // delta between each sub-4x4 block.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[2]);
  const int v2 = abs(DCs[4]);
  int max_v = (v1 > v0) ? v1 : v0;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}

static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
  score_t score = 0;
  while (num_blocks-- > 0) {    // TODO(skal): refine positional scoring?
    int i;
    for (i = 1; i < 16; ++i) {  // omit DC, we're only interested in AC
      score += (levels[i] != 0);
      if (score > thresh) return 0;
    }
    levels += 16;
  }
  return 1;
}

static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (mode > 0 &&
        IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
      // penalty to avoid a flat area being mispredicted by a complex mode
      rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
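    // (On a win below, pointers are swapped rather than copied: rd_cur/rd_best
    // via SwapModeScore() and the yuv_out_/yuv_out2_ buffers via SwapOut(), so
    // the best reconstruction always ends up in it->yuv_out_.)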
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);  // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}

static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;  // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;  // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;  // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;  // not calling TDisto here: it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {  // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.
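// SimpleQuantize() below reconstructs the macroblock with the modes already
// stored in the iterator, re-running quantization (with trellis if enabled)
// but not the mode decision. RefineUsingDistortion() instead re-examines the
// intra16/intra4/UV mode choices, using distortion plus fixed per-mode rate
// constants rather than true rate.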
static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empirical constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {  // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }
      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {  // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we don't re-examine the decision but just go
    // ahead with quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}
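//------------------------------------------------------------------------------

// A rough sketch of how VP8Decimate() is typically driven, one macroblock at
// a time (the real loop lives in frame_enc.c; iterator calls as declared in
// vp8i_enc.h):
//
//   VP8EncIterator it;
//   VP8IteratorInit(enc, &it);
//   do {
//     VP8ModeScore info;
//     VP8IteratorImport(&it, NULL);
//     const int skipped = VP8Decimate(&it, &info, enc->rd_opt_level_);
//     // ... entropy-code or record the macroblock using 'info'/'skipped' ...
//   } while (VP8IteratorNext(&it));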