1/* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "./vpx_dsp_rtcd.h" 12 13#include "vpx_config.h" 14#include "vp8_rtcd.h" 15#include "encodemb.h" 16#include "vp8/common/reconinter.h" 17#include "vp8/encoder/quantize.h" 18#include "tokenize.h" 19#include "vp8/common/invtrans.h" 20#include "vpx_mem/vpx_mem.h" 21#include "rdopt.h" 22 23void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) { 24 unsigned char *src_ptr = (*(be->base_src) + be->src); 25 short *diff_ptr = be->src_diff; 26 unsigned char *pred_ptr = bd->predictor; 27 int src_stride = be->src_stride; 28 29 vpx_subtract_block(4, 4, diff_ptr, pitch, src_ptr, src_stride, pred_ptr, 30 pitch); 31} 32 33void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc, 34 int src_stride, unsigned char *upred, 35 unsigned char *vpred, int pred_stride) { 36 short *udiff = diff + 256; 37 short *vdiff = diff + 320; 38 39 vpx_subtract_block(8, 8, udiff, 8, usrc, src_stride, upred, pred_stride); 40 vpx_subtract_block(8, 8, vdiff, 8, vsrc, src_stride, vpred, pred_stride); 41} 42 43void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride, 44 unsigned char *pred, int pred_stride) { 45 vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride); 46} 47 48static void vp8_subtract_mb(MACROBLOCK *x) { 49 BLOCK *b = &x->block[0]; 50 51 vp8_subtract_mby(x->src_diff, *(b->base_src), b->src_stride, 52 x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride); 53 vp8_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer, 54 x->src.uv_stride, x->e_mbd.dst.u_buffer, 55 x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride); 56} 57 58static void build_dcblock(MACROBLOCK *x) { 59 short *src_diff_ptr = &x->src_diff[384]; 60 int i; 61 62 for (i = 0; i < 16; ++i) { 63 src_diff_ptr[i] = x->coeff[i * 16]; 64 } 65} 66 67void vp8_transform_mbuv(MACROBLOCK *x) { 68 int i; 69 70 for (i = 16; i < 24; i += 2) { 71 x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16); 72 } 73} 74 75void vp8_transform_intra_mby(MACROBLOCK *x) { 76 int i; 77 78 for (i = 0; i < 16; i += 2) { 79 x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); 80 } 81 82 /* build dc block from 16 y dc values */ 83 build_dcblock(x); 84 85 /* do 2nd order transform on the dc block */ 86 x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); 87} 88 89static void transform_mb(MACROBLOCK *x) { 90 int i; 91 92 for (i = 0; i < 16; i += 2) { 93 x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); 94 } 95 96 /* build dc block from 16 y dc values */ 97 if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) build_dcblock(x); 98 99 for (i = 16; i < 24; i += 2) { 100 x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16); 101 } 102 103 /* do 2nd order transform on the dc block */ 104 if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) { 105 x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); 106 } 107} 108 109static void transform_mby(MACROBLOCK *x) { 110 int i; 111 112 for (i = 0; i < 16; i += 2) { 113 x->short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32); 114 } 115 116 /* build dc block from 16 y dc values */ 117 if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) { 118 build_dcblock(x); 119 x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8); 120 } 121} 122 123#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF) 124 125typedef struct vp8_token_state vp8_token_state; 126 127struct vp8_token_state { 128 int rate; 129 int error; 130 signed char next; 131 signed char token; 132 short qc; 133}; 134 135/* TODO: experiments to find optimal multiple numbers */ 136#define Y1_RD_MULT 4 137#define UV_RD_MULT 2 138#define Y2_RD_MULT 16 139 140static const int plane_rd_mult[4] = { Y1_RD_MULT, Y2_RD_MULT, UV_RD_MULT, 141 Y1_RD_MULT }; 142 143static void optimize_b(MACROBLOCK *mb, int ib, int type, ENTROPY_CONTEXT *a, 144 ENTROPY_CONTEXT *l) { 145 BLOCK *b; 146 BLOCKD *d; 147 vp8_token_state tokens[17][2]; 148 unsigned best_mask[2]; 149 const short *dequant_ptr; 150 const short *coeff_ptr; 151 short *qcoeff_ptr; 152 short *dqcoeff_ptr; 153 int eob; 154 int i0; 155 int rc; 156 int x; 157 int sz = 0; 158 int next; 159 int rdmult; 160 int rddiv; 161 int final_eob; 162 int rd_cost0; 163 int rd_cost1; 164 int rate0; 165 int rate1; 166 int error0; 167 int error1; 168 int t0; 169 int t1; 170 int best; 171 int band; 172 int pt; 173 int i; 174 int err_mult = plane_rd_mult[type]; 175 176 b = &mb->block[ib]; 177 d = &mb->e_mbd.block[ib]; 178 179 dequant_ptr = d->dequant; 180 coeff_ptr = b->coeff; 181 qcoeff_ptr = d->qcoeff; 182 dqcoeff_ptr = d->dqcoeff; 183 i0 = !type; 184 eob = *d->eob; 185 186 /* Now set up a Viterbi trellis to evaluate alternative roundings. */ 187 rdmult = mb->rdmult * err_mult; 188 if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) { 189 rdmult = (rdmult * 9) >> 4; 190 } 191 192 rddiv = mb->rddiv; 193 best_mask[0] = best_mask[1] = 0; 194 /* Initialize the sentinel node of the trellis. */ 195 tokens[eob][0].rate = 0; 196 tokens[eob][0].error = 0; 197 tokens[eob][0].next = 16; 198 tokens[eob][0].token = DCT_EOB_TOKEN; 199 tokens[eob][0].qc = 0; 200 *(tokens[eob] + 1) = *(tokens[eob] + 0); 201 next = eob; 202 for (i = eob; i-- > i0;) { 203 int base_bits; 204 int d2; 205 int dx; 206 207 rc = vp8_default_zig_zag1d[i]; 208 x = qcoeff_ptr[rc]; 209 /* Only add a trellis state for non-zero coefficients. */ 210 if (x) { 211 int shortcut = 0; 212 error0 = tokens[next][0].error; 213 error1 = tokens[next][1].error; 214 /* Evaluate the first possibility for this state. */ 215 rate0 = tokens[next][0].rate; 216 rate1 = tokens[next][1].rate; 217 t0 = (vp8_dct_value_tokens_ptr + x)->Token; 218 /* Consider both possible successor states. */ 219 if (next < 16) { 220 band = vp8_coef_bands[i + 1]; 221 pt = vp8_prev_token_class[t0]; 222 rate0 += mb->token_costs[type][band][pt][tokens[next][0].token]; 223 rate1 += mb->token_costs[type][band][pt][tokens[next][1].token]; 224 } 225 rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); 226 rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); 227 if (rd_cost0 == rd_cost1) { 228 rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0); 229 rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1); 230 } 231 /* And pick the best. */ 232 best = rd_cost1 < rd_cost0; 233 base_bits = *(vp8_dct_value_cost_ptr + x); 234 dx = dqcoeff_ptr[rc] - coeff_ptr[rc]; 235 d2 = dx * dx; 236 tokens[i][0].rate = base_bits + (best ? rate1 : rate0); 237 tokens[i][0].error = d2 + (best ? error1 : error0); 238 tokens[i][0].next = next; 239 tokens[i][0].token = t0; 240 tokens[i][0].qc = x; 241 best_mask[0] |= best << i; 242 /* Evaluate the second possibility for this state. */ 243 rate0 = tokens[next][0].rate; 244 rate1 = tokens[next][1].rate; 245 246 if ((abs(x) * dequant_ptr[rc] > abs(coeff_ptr[rc])) && 247 (abs(x) * dequant_ptr[rc] < abs(coeff_ptr[rc]) + dequant_ptr[rc])) { 248 shortcut = 1; 249 } else { 250 shortcut = 0; 251 } 252 253 if (shortcut) { 254 sz = -(x < 0); 255 x -= 2 * sz + 1; 256 } 257 258 /* Consider both possible successor states. */ 259 if (!x) { 260 /* If we reduced this coefficient to zero, check to see if 261 * we need to move the EOB back here. 262 */ 263 t0 = 264 tokens[next][0].token == DCT_EOB_TOKEN ? DCT_EOB_TOKEN : ZERO_TOKEN; 265 t1 = 266 tokens[next][1].token == DCT_EOB_TOKEN ? DCT_EOB_TOKEN : ZERO_TOKEN; 267 } else { 268 t0 = t1 = (vp8_dct_value_tokens_ptr + x)->Token; 269 } 270 if (next < 16) { 271 band = vp8_coef_bands[i + 1]; 272 if (t0 != DCT_EOB_TOKEN) { 273 pt = vp8_prev_token_class[t0]; 274 rate0 += mb->token_costs[type][band][pt][tokens[next][0].token]; 275 } 276 if (t1 != DCT_EOB_TOKEN) { 277 pt = vp8_prev_token_class[t1]; 278 rate1 += mb->token_costs[type][band][pt][tokens[next][1].token]; 279 } 280 } 281 282 rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); 283 rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); 284 if (rd_cost0 == rd_cost1) { 285 rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0); 286 rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1); 287 } 288 /* And pick the best. */ 289 best = rd_cost1 < rd_cost0; 290 base_bits = *(vp8_dct_value_cost_ptr + x); 291 292 if (shortcut) { 293 dx -= (dequant_ptr[rc] + sz) ^ sz; 294 d2 = dx * dx; 295 } 296 tokens[i][1].rate = base_bits + (best ? rate1 : rate0); 297 tokens[i][1].error = d2 + (best ? error1 : error0); 298 tokens[i][1].next = next; 299 tokens[i][1].token = best ? t1 : t0; 300 tokens[i][1].qc = x; 301 best_mask[1] |= best << i; 302 /* Finally, make this the new head of the trellis. */ 303 next = i; 304 } 305 /* There's no choice to make for a zero coefficient, so we don't 306 * add a new trellis node, but we do need to update the costs. 307 */ 308 else { 309 band = vp8_coef_bands[i + 1]; 310 t0 = tokens[next][0].token; 311 t1 = tokens[next][1].token; 312 /* Update the cost of each path if we're past the EOB token. */ 313 if (t0 != DCT_EOB_TOKEN) { 314 tokens[next][0].rate += mb->token_costs[type][band][0][t0]; 315 tokens[next][0].token = ZERO_TOKEN; 316 } 317 if (t1 != DCT_EOB_TOKEN) { 318 tokens[next][1].rate += mb->token_costs[type][band][0][t1]; 319 tokens[next][1].token = ZERO_TOKEN; 320 } 321 /* Don't update next, because we didn't add a new node. */ 322 } 323 } 324 325 /* Now pick the best path through the whole trellis. */ 326 band = vp8_coef_bands[i + 1]; 327 VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); 328 rate0 = tokens[next][0].rate; 329 rate1 = tokens[next][1].rate; 330 error0 = tokens[next][0].error; 331 error1 = tokens[next][1].error; 332 t0 = tokens[next][0].token; 333 t1 = tokens[next][1].token; 334 rate0 += mb->token_costs[type][band][pt][t0]; 335 rate1 += mb->token_costs[type][band][pt][t1]; 336 rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); 337 rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); 338 if (rd_cost0 == rd_cost1) { 339 rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0); 340 rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1); 341 } 342 best = rd_cost1 < rd_cost0; 343 final_eob = i0 - 1; 344 for (i = next; i < eob; i = next) { 345 x = tokens[i][best].qc; 346 if (x) final_eob = i; 347 rc = vp8_default_zig_zag1d[i]; 348 qcoeff_ptr[rc] = x; 349 dqcoeff_ptr[rc] = x * dequant_ptr[rc]; 350 next = tokens[i][best].next; 351 best = (best_mask[best] >> i) & 1; 352 } 353 final_eob++; 354 355 *a = *l = (final_eob != !type); 356 *d->eob = (char)final_eob; 357} 358static void check_reset_2nd_coeffs(MACROBLOCKD *x, int type, ENTROPY_CONTEXT *a, 359 ENTROPY_CONTEXT *l) { 360 int sum = 0; 361 int i; 362 BLOCKD *bd = &x->block[24]; 363 364 if (bd->dequant[0] >= 35 && bd->dequant[1] >= 35) return; 365 366 for (i = 0; i < (*bd->eob); ++i) { 367 int coef = bd->dqcoeff[vp8_default_zig_zag1d[i]]; 368 sum += (coef >= 0) ? coef : -coef; 369 if (sum >= 35) return; 370 } 371 /************************************************************************** 372 our inverse hadamard transform effectively is weighted sum of all 16 inputs 373 with weight either 1 or -1. It has a last stage scaling of (sum+3)>>3. And 374 dc only idct is (dc+4)>>3. So if all the sums are between -35 and 29, the 375 output after inverse wht and idct will be all zero. A sum of absolute value 376 smaller than 35 guarantees all 16 different (+1/-1) weighted sums in wht 377 fall between -35 and +35. 378 **************************************************************************/ 379 if (sum < 35) { 380 for (i = 0; i < (*bd->eob); ++i) { 381 int rc = vp8_default_zig_zag1d[i]; 382 bd->qcoeff[rc] = 0; 383 bd->dqcoeff[rc] = 0; 384 } 385 *bd->eob = 0; 386 *a = *l = (*bd->eob != !type); 387 } 388} 389 390static void optimize_mb(MACROBLOCK *x) { 391 int b; 392 int type; 393 int has_2nd_order; 394 395 ENTROPY_CONTEXT_PLANES t_above, t_left; 396 ENTROPY_CONTEXT *ta; 397 ENTROPY_CONTEXT *tl; 398 399 memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); 400 memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); 401 402 ta = (ENTROPY_CONTEXT *)&t_above; 403 tl = (ENTROPY_CONTEXT *)&t_left; 404 405 has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED && 406 x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); 407 type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC; 408 409 for (b = 0; b < 16; ++b) { 410 optimize_b(x, b, type, ta + vp8_block2above[b], tl + vp8_block2left[b]); 411 } 412 413 for (b = 16; b < 24; ++b) { 414 optimize_b(x, b, PLANE_TYPE_UV, ta + vp8_block2above[b], 415 tl + vp8_block2left[b]); 416 } 417 418 if (has_2nd_order) { 419 b = 24; 420 optimize_b(x, b, PLANE_TYPE_Y2, ta + vp8_block2above[b], 421 tl + vp8_block2left[b]); 422 check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2, ta + vp8_block2above[b], 423 tl + vp8_block2left[b]); 424 } 425} 426 427void vp8_optimize_mby(MACROBLOCK *x) { 428 int b; 429 int type; 430 int has_2nd_order; 431 432 ENTROPY_CONTEXT_PLANES t_above, t_left; 433 ENTROPY_CONTEXT *ta; 434 ENTROPY_CONTEXT *tl; 435 436 if (!x->e_mbd.above_context) return; 437 438 if (!x->e_mbd.left_context) return; 439 440 memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); 441 memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); 442 443 ta = (ENTROPY_CONTEXT *)&t_above; 444 tl = (ENTROPY_CONTEXT *)&t_left; 445 446 has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED && 447 x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); 448 type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC; 449 450 for (b = 0; b < 16; ++b) { 451 optimize_b(x, b, type, ta + vp8_block2above[b], tl + vp8_block2left[b]); 452 } 453 454 if (has_2nd_order) { 455 b = 24; 456 optimize_b(x, b, PLANE_TYPE_Y2, ta + vp8_block2above[b], 457 tl + vp8_block2left[b]); 458 check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2, ta + vp8_block2above[b], 459 tl + vp8_block2left[b]); 460 } 461} 462 463void vp8_optimize_mbuv(MACROBLOCK *x) { 464 int b; 465 ENTROPY_CONTEXT_PLANES t_above, t_left; 466 ENTROPY_CONTEXT *ta; 467 ENTROPY_CONTEXT *tl; 468 469 if (!x->e_mbd.above_context) return; 470 471 if (!x->e_mbd.left_context) return; 472 473 memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); 474 memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); 475 476 ta = (ENTROPY_CONTEXT *)&t_above; 477 tl = (ENTROPY_CONTEXT *)&t_left; 478 479 for (b = 16; b < 24; ++b) { 480 optimize_b(x, b, PLANE_TYPE_UV, ta + vp8_block2above[b], 481 tl + vp8_block2left[b]); 482 } 483} 484 485void vp8_encode_inter16x16(MACROBLOCK *x) { 486 vp8_build_inter_predictors_mb(&x->e_mbd); 487 488 vp8_subtract_mb(x); 489 490 transform_mb(x); 491 492 vp8_quantize_mb(x); 493 494 if (x->optimize) optimize_mb(x); 495} 496 497/* this funciton is used by first pass only */ 498void vp8_encode_inter16x16y(MACROBLOCK *x) { 499 BLOCK *b = &x->block[0]; 500 501 vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer, 502 x->e_mbd.dst.y_stride); 503 504 vp8_subtract_mby(x->src_diff, *(b->base_src), b->src_stride, 505 x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride); 506 507 transform_mby(x); 508 509 vp8_quantize_mby(x); 510 511 vp8_inverse_transform_mby(&x->e_mbd); 512} 513